File size: 1,660 Bytes
6434339 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import { JSDOM, VirtualConsole } from "jsdom";
/**
 * Removes every SCRIPT and STYLE element found in the subtree below `node`,
 * mutating the tree in place.
 *
 * The root `node` itself is never removed; only its descendants are examined.
 *
 * @param node - Root of the subtree to clean.
 */
function removeTags(node: Node) {
  if (!node.hasChildNodes()) {
    return;
  }
  // `childNodes` is a live list: removing a child while iterating it directly
  // shifts the remaining entries and causes siblings to be skipped, so walk a
  // snapshot instead.
  for (const childNode of Array.from(node.childNodes)) {
    // Test the child (not the parent) so the element itself is dropped along
    // with everything inside it.
    if (childNode.nodeName === "SCRIPT" || childNode.nodeName === "STYLE") {
      node.removeChild(childNode);
    } else {
      removeTags(childNode);
    }
  }
}
/**
 * Recursively collects the text beneath `node`.
 *
 * Each child contributes one entry: text nodes contribute their own text,
 * element nodes contribute the result of recursing into them, and every other
 * node type (comments, etc.) contributes an empty string. Entries are joined
 * with "\n".
 *
 * @param node - Root whose descendants' text is gathered.
 * @returns The joined text; an empty string when `node` has no children.
 */
function naiveInnerText(node: Node): string {
  // The DOM `Node` constructor is not available in the Node.js global scope,
  // but every node instance exposes the node-type constants through its
  // prototype chain, so read them from the instance.
  const { TEXT_NODE, ELEMENT_NODE } = node;
  return Array.from(node.childNodes)
    .map((childNode) => {
      switch (childNode.nodeType) {
        case TEXT_NODE:
          // Use the text node's own content; the parent's textContent would
          // repeat the whole subtree's text once per text child.
          return childNode.textContent ?? "";
        case ELEMENT_NODE:
          return naiveInnerText(childNode);
        default:
          return "";
      }
    })
    .join("\n");
}
/**
 * Downloads the page at `url` and returns the text of its `<body>` with
 * script/style content removed and whitespace collapsed.
 *
 * The request is aborted after 10 seconds. Fetching is best-effort: if the
 * request fails or is aborted, the error is logged and the page is treated as
 * empty, so the returned text is an empty string.
 *
 * @param url - Address of the page to fetch.
 * @returns The extracted text with newlines and double spaces removed.
 * @throws Error if the parsed document has no body element.
 */
export async function parseWeb(url: string): Promise<string> {
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), 10000);

  let htmlString = "";
  try {
    const response = await fetch(url, { signal: abortController.signal });
    htmlString = await response.text();
  } catch (err) {
    // Best-effort: log and fall through with an empty document.
    console.log(err);
  } finally {
    // Without this the timer stays pending after the request has settled and
    // keeps the event loop alive until it fires.
    clearTimeout(timeoutId);
  }

  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {
    // No-op to skip console errors.
  });

  // put the html string into a DOM
  const dom = new JSDOM(htmlString, {
    virtualConsole,
  });
  const body = dom.window.document.querySelector("body");
  if (!body) throw new Error("body of the webpage is null");

  removeTags(body);

  // recursively extract text content from the body and then remove newlines and double spaces
  return naiveInnerText(body).replace(/ {2}|\r\n|\n|\r/gm, "");
}
|