/*
 * fix(collector): infer file extension from Content-Type for URLs without
 * explicit extensions
 *
 * When downloading files from URLs like https://arxiv.org/pdf/2307.10265, the
 * path has no recognizable file extension. The downloaded file gets saved
 * without an extension (or with a nonsensical one like .10265), causing
 * processSingleFile to reject it with
 * 'File extension .10265 not supported for parsing'.
 *
 * Fix: after downloading, check if the filename has a supported file
 * extension. If not, inspect the response Content-Type header and map it to
 * the correct extension using the existing ACCEPTED_MIMES table. For example,
 * a response with Content-Type: application/pdf will cause the file to be
 * saved with a .pdf extension, allowing it to be processed correctly.
 *
 * Fixes #4513
 *
 * - small refactor
 *
 * Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
 *
 * File: 91 lines, 3.5 KiB, JavaScript
 */
const { WATCH_DIRECTORY, ACCEPTED_MIMES } = require("../constants");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
const { validURL } = require("../url");
const { default: slugify } = require("slugify");

// Register a custom slugify character mapping so that "/" becomes "-".
// This lets a whole URL path be slugged into a single flat filename.
slugify.extend({ "/": "-" });

/**
 * Maps a MIME type to the preferred file extension using ACCEPTED_MIMES.
 *
 * The value is looked up exactly as given first. If that misses, a normalized
 * form is tried: any parameters after ";" are dropped, surrounding whitespace
 * is trimmed and the type is lower-cased, since media types are
 * case-insensitive and Content-Type headers often carry parameters such as
 * "; charset=utf-8".
 *
 * Returns null if the MIME type is not a non-empty string, is not recognized,
 * or has no possible extensions.
 *
 * @param {string} mimeType - The MIME type to resolve (e.g., "application/pdf")
 * @returns {string|null} - The first extension listed for the type (e.g., ".pdf") or null
 */
function mimeToExtension(mimeType) {
  if (typeof mimeType !== "string" || mimeType.length === 0) return null;

  // Exact match goes first so inputs that already resolved keep resolving to
  // the same entry; the normalized form only widens what is accepted.
  const normalized = mimeType.split(";")[0].trim().toLowerCase();
  const key = [mimeType, normalized].find((candidate) =>
    // Object.hasOwn avoids relying on a hasOwnProperty method reachable
    // through the table itself and ignores inherited keys like "toString".
    Object.hasOwn(ACCEPTED_MIMES, candidate)
  );
  if (key === undefined) return null;

  const possibleExtensions = ACCEPTED_MIMES[key] ?? [];
  if (possibleExtensions.length === 0) return null;
  return possibleExtensions[0];
}

/**
 * Download a file to the hotdir.
 *
 * The local filename is built from the URL hostname and a slugified copy of
 * the URL path. When that name does not end in an extension listed in
 * SUPPORTED_FILETYPE_CONVERTERS, the response Content-Type header is mapped to
 * an extension with mimeToExtension() and appended to the filename.
 *
 * Errors raised while fetching or writing are caught, logged and reported
 * through the returned object. If writing the body fails part-way, the
 * incomplete file is removed so no truncated download is left in the hotdir.
 *
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - The maximum time in milliseconds to wait for
 *   fetch() to settle before the request is aborted. The timer is cleared once
 *   fetch() settles, so it does not bound the body transfer that follows.
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>} -
 *   On success, `fileLocation` is the path of the downloaded file and `reason`
 *   is null. On failure, `fileLocation` is null and `reason` holds the message.
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  try {
    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error(
        `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
        url.toString()
      );
    }, maxTimeout);

    let res;
    try {
      res = await fetch(url, { signal: abortController.signal });
    } finally {
      // Always disarm the timer, whether fetch resolved or rejected.
      clearTimeout(timeout);
    }
    if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);

    const urlObj = new URL(url);
    const sluggedPath = slugify(urlObj.pathname, { lower: true });
    let filename = `${urlObj.hostname}-${sluggedPath}`;

    const existingExt = path.extname(filename).toLowerCase();
    // NOTE(review): required lazily inside the function as in the original,
    // presumably to avoid a circular import - confirm before hoisting.
    const { SUPPORTED_FILETYPE_CONVERTERS } = require("../constants");

    // If the filename does not already have a supported file extension,
    // try to infer one from the response Content-Type header.
    // This handles URLs like https://arxiv.org/pdf/2307.10265 where the
    // path has no explicit extension but the server responds with
    // Content-Type: application/pdf.
    if (!Object.hasOwn(SUPPORTED_FILETYPE_CONVERTERS, existingExt)) {
      const { parseContentType } = require("../../processLink/helpers");
      const contentType = parseContentType(res.headers.get("Content-Type"));
      const inferredExt = mimeToExtension(contentType);
      if (inferredExt) {
        console.log(
          `[Collector] URL path has no recognized extension. Inferred ${inferredExt} from Content-Type: ${contentType}`
        );
        filename += inferredExt;
      }
    }

    const localFilePath = path.join(WATCH_DIRECTORY, filename);
    try {
      await pipeline(res.body, fs.createWriteStream(localFilePath));
    } catch (writeError) {
      // A failed or aborted transfer can leave a truncated file behind.
      // Remove it (best-effort) and surface the original write error.
      try {
        await fs.promises.rm(localFilePath, { force: true });
      } catch (cleanupError) {
        console.error(
          `Failed to remove partial download ${localFilePath}: ${cleanupError.message}`
        );
      }
      throw writeError;
    }

    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);
    return { success: false, reason: error.message, fileLocation: null };
  }
}

module.exports = {
|
|
downloadURIToFile,
|
|
mimeToExtension,
|
|
};
|