feat: use parser to scrape
This commit is contained in:
57
server/src/markdownify/scrape.ts
Normal file
57
server/src/markdownify/scrape.ts
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
import { chromium } from "playwright";
|
||||||
|
import { parseMarkdown } from "./markdown";
|
||||||
|
|
||||||
|
/**
 * Loads a web page in Chromium, removes non-content elements and inline
 * event-handler attributes from the rendered DOM, and converts the remaining
 * HTML to Markdown via `parseMarkdown`.
 *
 * @param url - Address of the page to load; passed unchanged to `page.goto`.
 * @returns The result of `parseMarkdown` applied to the cleaned document HTML.
 * @throws Propagates any error raised while launching the browser, navigating,
 *   evaluating the cleanup script, or converting to Markdown. The browser is
 *   closed in all of these cases.
 */
export async function convertPageToMarkdown(url: string): Promise<string> {
  const browser = await chromium.launch();
  let cleanedHtml: string;

  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: "networkidle" });

    // The cleanup must run against the document that is already loaded.
    // `page.addInitScript` only registers a script for *future* navigations,
    // so calling it after `goto` would leave the current DOM untouched.
    cleanedHtml = await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed",
      ];

      // A selector list matches the union of all entries in a single query.
      document
        .querySelectorAll(selectors.join(","))
        .forEach((el) => el.remove());

      // Remove inline event handlers (onclick, onload, ...).
      document.querySelectorAll("*").forEach((el) => {
        // Snapshot the attributes first: removing while iterating the live
        // NamedNodeMap would skip entries.
        for (const attr of Array.from(el.attributes)) {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        }
      });

      // Serialize the document after cleanup.
      return document.documentElement.outerHTML;
    });
  } finally {
    // Always release the browser process, even when navigation or
    // evaluation fails.
    await browser.close();
  }

  // Convert cleaned HTML -> Markdown
  const markdown = await parseMarkdown(cleanedHtml || "");
  return markdown;
}
|
||||||
Reference in New Issue
Block a user