diff --git a/collector/__tests__/utils/url/index.test.js b/collector/__tests__/utils/url/index.test.js index cb4211b1..4a19b799 100644 --- a/collector/__tests__/utils/url/index.test.js +++ b/collector/__tests__/utils/url/index.test.js @@ -74,8 +74,10 @@ describe("validURL", () => { }); describe("validateURL", () => { - it("should return the exact same URL if it's already valid", () => { - expect(validateURL("https://www.google.com")).toBe("https://www.google.com"); + it("should return the same URL if it's already valid", () => { + expect(validateURL("https://www.google.com")).toBe( + "https://www.google.com" + ); expect(validateURL("http://www.google.com")).toBe("http://www.google.com"); expect(validateURL("https://random")).toBe("https://random"); @@ -88,16 +90,22 @@ describe("validateURL", () => { it("should assume https:// if the URL doesn't have a protocol", () => { expect(validateURL("www.google.com")).toBe("https://www.google.com"); expect(validateURL("google.com")).toBe("https://google.com"); + expect(validateURL("EXAMPLE.com/ABCDEF/q1=UPPER")).toBe("https://example.com/ABCDEF/q1=UPPER"); expect(validateURL("ftp://www.google.com")).toBe("ftp://www.google.com"); - expect(validateURL("mailto://www.google.com")).toBe("mailto://www.google.com"); + expect(validateURL("mailto://www.google.com")).toBe( + "mailto://www.google.com" + ); expect(validateURL("tel://www.google.com")).toBe("tel://www.google.com"); expect(validateURL("data://www.google.com")).toBe("data://www.google.com"); }); it("should remove trailing slashes post-validation", () => { - expect(validateURL("https://www.google.com/")).toBe("https://www.google.com"); + expect(validateURL("https://www.google.com/")).toBe( + "https://www.google.com" + ); expect(validateURL("http://www.google.com/")).toBe("http://www.google.com"); expect(validateURL("https://random/")).toBe("https://random"); + expect(validateURL("https://example.com/ABCDEF/")).toBe("https://example.com/ABCDEF"); }); it("should handle edge cases and bad data inputs", () => { @@ -109,4 +117,13 @@ describe("validateURL", () => { expect(validateURL(" ")).toBe(""); expect(validateURL(" look here! ")).toBe("look here!"); }); + + it("should preserve case of characters in URL pathname", () => { + expect(validateURL("https://example.com/To/ResOURce?q1=Value&qZ22=UPPE!R")) + .toBe("https://example.com/To/ResOURce?q1=Value&qZ22=UPPE!R"); + expect(validateURL("https://sample.com/uPeRCaSe")) + .toBe("https://sample.com/uPeRCaSe"); + expect(validateURL("Example.com/PATH/To/Resource?q2=Value&q1=UPPER")) + .toBe("https://example.com/PATH/To/Resource?q2=Value&q1=UPPER"); + }); }); diff --git a/collector/utils/downloadURIToFile/index.js b/collector/utils/downloadURIToFile/index.js index a91a054c..f7326658 100644 --- a/collector/utils/downloadURIToFile/index.js +++ b/collector/utils/downloadURIToFile/index.js @@ -3,6 +3,7 @@ const fs = require("fs"); const path = require("path"); const { pipeline } = require("stream/promises"); const { validURL } = require("../url"); +const { default: slugify } = require("slugify"); /** * Download a file to the hotdir @@ -31,7 +32,12 @@ async function downloadURIToFile(url, maxTimeout = 10_000) { }) .finally(() => clearTimeout(timeout)); - const localFilePath = path.join(WATCH_DIRECTORY, path.basename(url)); + const urlObj = new URL(url); + const filename = `${urlObj.hostname}-${slugify( + urlObj.pathname.replace(/\//g, "-"), + { lower: true } + )}`; + const localFilePath = path.join(WATCH_DIRECTORY, filename); const writeStream = fs.createWriteStream(localFilePath); await pipeline(res.body, writeStream); diff --git a/collector/utils/url/index.js b/collector/utils/url/index.js index 6c98281b..c5a28f71 100644 --- a/collector/utils/url/index.js +++ b/collector/utils/url/index.js @@ -80,14 +80,12 @@ function validURL(url) { */ function validateURL(url) { try { - let destination = url.trim().toLowerCase(); + let destination = url.trim(); // If the URL has a protocol, just pass through - if (destination.includes("://")) { + // If the URL doesn't have a protocol, assume https:// + if (destination.includes("://")) destination = new URL(destination).toString(); - } else { - // If the URL doesn't have a protocol, assume https:// - destination = new URL(`https://${destination.trim()}`).toString(); - } + else destination = new URL(`https://${destination}`).toString(); // If the URL ends with a slash, remove it return destination.endsWith("/") ? destination.slice(0, -1) : destination;