Fix: File pulling fails with uppercase URL characters (#4516)

* fix: remove unnecessary toLowerCase in URL validation * test: enhance URL validation tests to preserve case sensitivity and format * test: update URL validation tests to ensure domain normalization to lowercase while preserving path case * small formatting * fix filenames when downloading live URI --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
2025-10-08 14:00:02 -07:00 · 2025-10-08 14:00:02 -07:00 · d48c76919c
commit d48c76919c
parent 4e6f0b33ab
3 changed files with 32 additions and 11 deletions
--- a/collector/tests/utils/url/index.test.js
+++ b/collector/tests/utils/url/index.test.js
@ -74,8 +74,10 @@ describe("validURL", () => {
 });
 describe("validateURL", () => {
-  it("should return the exact same URL if it's already valid", () => {
+  it("should return the same URL if it's already valid", () => {
-    expect(validateURL("https://www.google.com")).toBe("https://www.google.com");
+    expect(validateURL("https://www.google.com")).toBe(
      "https://www.google.com"
    );
    expect(validateURL("http://www.google.com")).toBe("http://www.google.com");
    expect(validateURL("https://random")).toBe("https://random");
@ -88,16 +90,22 @@ describe("validateURL", () => {
  it("should assume https:// if the URL doesn't have a protocol", () => {
    expect(validateURL("www.google.com")).toBe("https://www.google.com");
    expect(validateURL("google.com")).toBe("https://google.com");
    expect(validateURL("EXAMPLE.com/ABCDEF/q1=UPPER")).toBe("https://example.com/ABCDEF/q1=UPPER");
    expect(validateURL("ftp://www.google.com")).toBe("ftp://www.google.com");
-    expect(validateURL("mailto://www.google.com")).toBe("mailto://www.google.com");
+    expect(validateURL("mailto://www.google.com")).toBe(
      "mailto://www.google.com"
    );
    expect(validateURL("tel://www.google.com")).toBe("tel://www.google.com");
    expect(validateURL("data://www.google.com")).toBe("data://www.google.com");
  });
  it("should remove trailing slashes post-validation", () => {
-    expect(validateURL("https://www.google.com/")).toBe("https://www.google.com");
+    expect(validateURL("https://www.google.com/")).toBe(
      "https://www.google.com"
    );
    expect(validateURL("http://www.google.com/")).toBe("http://www.google.com");
    expect(validateURL("https://random/")).toBe("https://random");
    expect(validateURL("https://example.com/ABCDEF/")).toBe("https://example.com/ABCDEF");
  });
  it("should handle edge cases and bad data inputs", () => {
@ -109,4 +117,13 @@ describe("validateURL", () => {
    expect(validateURL(" ")).toBe("");
    expect(validateURL(" look here! ")).toBe("look here!");
  });
  it("should preserve case of characters in URL pathname", () => {
    expect(validateURL("https://example.com/To/ResOURce?q1=Value&qZ22=UPPE!R"))
      .toBe("https://example.com/To/ResOURce?q1=Value&qZ22=UPPE!R");
    expect(validateURL("https://sample.com/uPeRCaSe"))
      .toBe("https://sample.com/uPeRCaSe");
    expect(validateURL("Example.com/PATH/To/Resource?q2=Value&q1=UPPER"))
      .toBe("https://example.com/PATH/To/Resource?q2=Value&q1=UPPER");
  });
 });
--- a/collector/utils/downloadURIToFile/index.js
+++ b/collector/utils/downloadURIToFile/index.js
@ -3,6 +3,7 @@ const fs = require("fs");
 const path = require("path");
 const { pipeline } = require("stream/promises");
 const { validURL } = require("../url");
 const { default: slugify } = require("slugify");
 /**
 * Download a file to the hotdir
@ -31,7 +32,12 @@ async function downloadURIToFile(url, maxTimeout = 10_000) {
      })
      .finally(() => clearTimeout(timeout));
-    const localFilePath = path.join(WATCH_DIRECTORY, path.basename(url));
+    const urlObj = new URL(url);
    const filename = `${urlObj.hostname}-${slugify(
      urlObj.pathname.replace(/\//g, "-"),
      { lower: true }
    )}`;
    const localFilePath = path.join(WATCH_DIRECTORY, filename);
    const writeStream = fs.createWriteStream(localFilePath);
    await pipeline(res.body, writeStream);
--- a/collector/utils/url/index.js
+++ b/collector/utils/url/index.js
@ -80,14 +80,12 @@ function validURL(url) {
 */
 function validateURL(url) {
  try {
-    let destination = url.trim().toLowerCase();
+    let destination = url.trim();
    // If the URL has a protocol, just pass through
    if (destination.includes("://")) {
      destination = new URL(destination).toString();
    } else {
    // If the URL doesn't have a protocol, assume https://
-      destination = new URL(`https://${destination.trim()}`).toString();
+    if (destination.includes("://"))
-    }
+      destination = new URL(destination).toString();
    else destination = new URL(`https://${destination}`).toString();
    // If the URL ends with a slash, remove it
    return destination.endsWith("/") ? destination.slice(0, -1) : destination;