merlyn/collector/utils/downloadURIToFile/index.js
Yitong Li 2f7a818744
fix(collector): infer file extension from Content-Type for URLs without explicit extensions (#5252)
* fix(collector): infer file extension from Content-Type for URLs without explicit extensions

When downloading files from URLs like https://arxiv.org/pdf/2307.10265,
the path has no recognizable file extension. The downloaded file gets
saved without an extension (or with a nonsensical one like .10265),
causing processSingleFile to reject it with 'File extension .10265
not supported for parsing'.

Fix: after downloading, check if the filename has a supported file
extension. If not, inspect the response Content-Type header and map
it to the correct extension using the existing ACCEPTED_MIMES table.

For example, a response with Content-Type: application/pdf will cause
the file to be saved with a .pdf extension, allowing it to be processed
correctly.

Fixes #4513

* small refactor

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2026-03-23 09:40:22 -07:00

91 lines
3.5 KiB
JavaScript

const { WATCH_DIRECTORY, ACCEPTED_MIMES } = require("../constants");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
const { validURL } = require("../url");
const { default: slugify } = require("slugify");
// Extend slugify's character map so that every "/" is replaced with "-".
// downloadURIToFile slugifies the URL pathname to build the local filename,
// so path separators end up as dashes in that name.
slugify.extend({ "/": "-" });
/**
 * Maps a MIME type to the preferred file extension using ACCEPTED_MIMES.
 *
 * The value is first looked up exactly as given. If that misses, it is
 * normalized (any `;parameter` suffix stripped, surrounding whitespace trimmed,
 * lowercased) and looked up again, so values such as `Application/PDF` or
 * `application/pdf; charset=binary` still resolve.
 *
 * Returns null if the MIME type is not recognized or if there are no possible extensions.
 * @param {string} mimeType - The MIME type to resolve (e.g., "application/pdf")
 * @returns {string|null} - The file extension (e.g., ".pdf") or null
 */
function mimeToExtension(mimeType) {
  if (!mimeType) return null;

  const exact = String(mimeType);
  // Media types are case-insensitive and may carry parameters after a ";".
  const normalized = exact.split(";")[0].trim().toLowerCase();

  // Exact match wins so every lookup that worked before behaves the same.
  // Object.hasOwn avoids matching inherited keys such as "toString".
  const key = [exact, normalized].find((candidate) =>
    Object.hasOwn(ACCEPTED_MIMES, candidate)
  );
  if (key === undefined) return null;

  const possibleExtensions = ACCEPTED_MIMES[key] ?? [];
  if (possibleExtensions.length === 0) return null;

  // The first listed extension is treated as the preferred one.
  return possibleExtensions[0];
}
/**
 * Download a file to the hotdir (WATCH_DIRECTORY).
 *
 * The local filename is `<hostname>-<slugified pathname>`. When that name does
 * not end in an extension listed in SUPPORTED_FILETYPE_CONVERTERS, an extension
 * is inferred from the response Content-Type header and appended.
 *
 * The function never throws: every failure is logged and reported through the
 * returned object. If the download fails after the local file has been
 * created, the partial file is removed so no truncated file is left behind.
 *
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - The maximum time in milliseconds to wait for the
 * response to start. The timer is cleared once fetch() settles, so it does not
 * bound the time spent streaming the body to disk.
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>} - The path to the downloaded file
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  // Declared outside the try block so the catch handler knows whether a file
  // may already exist on disk and needs to be cleaned up.
  let localFilePath = null;

  try {
    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error(
        `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
        url.toString()
      );
    }, maxTimeout);

    let res;
    try {
      res = await fetch(url, { signal: abortController.signal });
    } finally {
      // Always disarm the timer, whether fetch resolved or rejected.
      clearTimeout(timeout);
    }
    if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);

    const urlObj = new URL(url);
    const sluggedPath = slugify(urlObj.pathname, { lower: true });
    let filename = `${urlObj.hostname}-${sluggedPath}`;
    const existingExt = path.extname(filename).toLowerCase();
    const { SUPPORTED_FILETYPE_CONVERTERS } = require("../constants");

    // If the filename does not already have a supported file extension,
    // try to infer one from the response Content-Type header.
    // This handles URLs like https://arxiv.org/pdf/2307.10265 where the
    // path has no explicit extension but the server responds with
    // Content-Type: application/pdf.
    if (!Object.hasOwn(SUPPORTED_FILETYPE_CONVERTERS, existingExt)) {
      const { parseContentType } = require("../../processLink/helpers");
      const contentType = parseContentType(res.headers.get("Content-Type"));
      const inferredExt = mimeToExtension(contentType);
      if (inferredExt) {
        console.log(
          `[Collector] URL path has no recognized extension. Inferred ${inferredExt} from Content-Type: ${contentType}`
        );
        filename += inferredExt;
      }
    }

    localFilePath = path.join(WATCH_DIRECTORY, filename);
    const writeStream = fs.createWriteStream(localFilePath);
    await pipeline(res.body, writeStream);
    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);

    // createWriteStream creates/truncates the target file, so a failure while
    // streaming leaves an empty or truncated file. Remove it; `force` makes this
    // a no-op when the file was never created.
    if (localFilePath) {
      await fs.promises
        .rm(localFilePath, { force: true })
        .catch((cleanupError) =>
          console.error(
            `Failed to remove partial download ${localFilePath}: ${cleanupError}`
          )
        );
    }

    return { success: false, reason: error.message, fileLocation: null };
  }
}
// Public API: the downloader plus the MIME-to-extension helper it uses.
module.exports = { downloadURIToFile, mimeToExtension };