gadget/docs/archive/services/web-fetcher.ts

277 lines
6.9 KiB
TypeScript

// src/services/web-fetcher.ts
// Copyright (C) 2025 DTP Technologies, LLC
// All Rights Reserved
import env from "../config/env.js";
import { chromium, type Browser, type Page } from "playwright";
import TurndownService from "turndown";
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import crypto from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { DtpService } from "../lib/service.js";
/**
 * Result of fetching a web page and converting it to Markdown.
 */
export interface FetchResult {
// The URL that was passed to the fetch call (echoed back unchanged).
url: string;
// Page title: the Readability article title when available, otherwise the
// browser's document title; "Unknown" when a cached entry has no title line.
title: string;
// Markdown text with a line-number prefix on every line.
markdown: string;
// Number of newline-separated lines in `markdown`.
lineCount: number;
}
/**
 * Fetches web pages with headless Chromium (Playwright), extracts the main
 * content with Readability, converts it to Markdown with Turndown, prefixes
 * every line with its line number, and caches the result on disk under
 * `<projectRoot>/.gadget-cache`.
 */
export class WebFetcherService extends DtpService {
  /** Matches the `"   N | "` prefix that `addLineNumbers` puts on each line. */
  private static readonly LINE_PREFIX = /^\s*\d+ \| /;

  private readonly turndown: TurndownService;
  private readonly cacheDir: string;

  get name(): string {
    return "WebFetcherService";
  }

  get slug(): string {
    return "web-fetcher";
  }

  constructor() {
    super();
    this.turndown = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      hr: "---",
    });
    // Drop non-informational elements so they never reach the Markdown output.
    this.turndown.remove([
      "script",
      "style",
      "noscript",
      "nav",
      "footer",
      "header",
      "button",
      "input",
      "form",
    ]);
    // Cache lives next to the project, one file per URL.
    this.cacheDir = path.join(env.projectRoot, ".gadget-cache");
  }

  /** Logs startup and makes sure the cache directory exists. */
  async start(): Promise<void> {
    this.log.info("service started");
    await this.ensureCacheDir();
  }

  /** Logs shutdown; there are no long-lived resources to release. */
  async stop(): Promise<void> {
    this.log.info("service stopped");
  }

  /** Renders an unknown thrown value as a short message for logging. */
  private static describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /** True when `err` is a Node.js file-system error with code `ENOENT`. */
  private static isMissingFile(err: unknown): boolean {
    return (
      typeof err === "object" &&
      err !== null &&
      (err as { code?: unknown }).code === "ENOENT"
    );
  }

  /**
   * Generates a cache key from a URL (hex-encoded SHA-256 of the URL string).
   */
  private getCacheKey(url: string): string {
    return crypto.createHash("sha256").update(url).digest("hex");
  }

  /**
   * Gets the cache file path for a URL: `<cacheDir>/<sha256>.md`.
   */
  private getCacheFilePath(url: string): string {
    return path.join(this.cacheDir, `${this.getCacheKey(url)}.md`);
  }

  /**
   * Ensures the cache directory exists (no error if it already does).
   */
  private async ensureCacheDir(): Promise<void> {
    await fs.mkdir(this.cacheDir, { recursive: true });
  }

  /**
   * Reads content from cache if available.
   *
   * A cache file is a three-line header (`## URL:`, `## Title:`,
   * `## Fetched:`), an optional blank separator line, then the line-numbered
   * Markdown body.
   *
   * @returns The cached result, or `null` when there is no usable entry
   *   (file missing, unreadable, or lacking the `## Fetched:` header line).
   */
  private async readFromCache(url: string): Promise<FetchResult | null> {
    let content: string;
    try {
      content = await fs.readFile(this.getCacheFilePath(url), "utf-8");
    } catch (err: unknown) {
      // A missing file is the normal cache-miss path; anything else is worth
      // surfacing, but a broken cache must never fail the fetch itself.
      if (!WebFetcherService.isMissingFile(err)) {
        this.log.info(
          `cache read failed for ${url}: ${WebFetcherService.describeError(err)}`,
        );
      }
      return null;
    }

    const lines = content.split("\n");
    const fetchedIdx = lines.findIndex((l) => l.startsWith("## Fetched:"));
    if (fetchedIdx === -1) {
      // Truncated or foreign file: treat as a miss so it gets refetched
      // instead of serving the raw file content as Markdown.
      return null;
    }

    // The body starts right after the header. Skip at most ONE blank
    // separator line; body lines always carry a number prefix, so a blank
    // line here can only be the separator. This accepts files written with
    // or without the separator and never drops the first body line.
    let bodyStart = fetchedIdx + 1;
    if (lines[bodyStart] === "") {
      bodyStart += 1;
    }
    const markdown = lines.slice(bodyStart).join("\n");
    const lineCount = markdown.split("\n").length;

    // Only look for the title inside the header, never in the body.
    const titleLine = lines
      .slice(0, fetchedIdx)
      .find((l) => l.startsWith("## Title:"));
    // `||` is intentional: an empty title also falls back to "Unknown".
    const title = titleLine?.replace("## Title:", "").trim() || "Unknown";

    return { url, title, markdown, lineCount };
  }

  /**
   * Writes content to cache as header lines, a blank separator, then the
   * Markdown body.
   *
   * @throws Propagates file-system errors from `mkdir` / `writeFile`.
   */
  private async writeToCache(
    url: string,
    title: string,
    markdown: string,
  ): Promise<void> {
    await this.ensureCacheDir();
    const cacheFile = this.getCacheFilePath(url);
    const timestamp = new Date().toISOString();
    // The header is line-oriented, so the title must stay on a single line.
    const singleLineTitle = title.replace(/\s+/g, " ").trim();
    const content = [
      `## URL: ${url}`,
      `## Title: ${singleLineTitle}`,
      `## Fetched: ${timestamp}`,
      "",
      markdown,
    ].join("\n");
    await fs.writeFile(cacheFile, content, "utf-8");
  }

  /**
   * Adds line numbers to markdown text: each line becomes `"   N | text"`,
   * with the 1-based number right-aligned to a minimum width of 4.
   */
  private addLineNumbers(text: string): string {
    return text
      .split("\n")
      .map((line, i) => `${(i + 1).toString().padStart(4, " ")} | ${line}`)
      .join("\n");
  }

  /**
   * Fetches a URL and returns line-numbered Markdown.
   * Uses the cache if available, otherwise fetches fresh content.
   *
   * NOTE(review): `url` is passed to the browser without validation here; if
   * callers forward untrusted input, restrict schemes/hosts before calling.
   *
   * @param url Page to load.
   * @param useCache When true (default), serve from and write to the disk
   *   cache; when false, always fetch and do not touch the cache.
   * @returns The page title and its numbered Markdown.
   * @throws Propagates Playwright errors (launch failure, navigation error,
   *   or the 30 s navigation timeout). Cache failures are logged, not thrown.
   */
  async fetchUrl(url: string, useCache: boolean = true): Promise<FetchResult> {
    // Try to read from cache first
    if (useCache) {
      const cached = await this.readFromCache(url);
      if (cached) {
        return cached;
      }
    }

    const browser: Browser = await chromium.launch({ headless: true });
    try {
      // Context and page are created inside the try so that a failure here
      // still closes the browser process in `finally`.
      const context = await browser.newContext();
      const page: Page = await context.newPage();

      // 1. Navigate and wait for SPA hydration
      await page.goto(url, { waitUntil: "networkidle", timeout: 30000 });
      const title = await page.title();

      // 2. Get the full HTML from the browser
      const rawHtml = await page.content();

      // 3. Use JSDOM to create a DOM instance for Readability
      const doc = new JSDOM(rawHtml, { url });

      // 4. Extract the "essential" content using Readability
      const article = new Readability(doc.window.document).parse();
      let htmlContent: string;
      let extractedTitle: string = title;
      if (article) {
        htmlContent = article.content || "";
        extractedTitle = article.title || title;
      } else {
        // Fallback: extract from <main> or <body>
        htmlContent = await page.evaluate(() => {
          const main = document.querySelector("main") || document.body;
          return main.innerHTML;
        });
      }

      // 5. Convert to Markdown
      const markdown = this.turndown.turndown(htmlContent);

      // 6. Apply line numbering for context offsets
      const numberedMarkdown = this.addLineNumbers(markdown);
      const result: FetchResult = {
        url,
        title: extractedTitle,
        markdown: numberedMarkdown,
        lineCount: numberedMarkdown.split("\n").length,
      };

      // 7. Cache the result. Caching is best-effort: a full disk or a
      //    permission problem must not discard a page we already fetched.
      if (useCache) {
        try {
          await this.writeToCache(url, extractedTitle, numberedMarkdown);
        } catch (err: unknown) {
          this.log.info(
            `cache write failed for ${url}: ${WebFetcherService.describeError(err)}`,
          );
        }
      }
      return result;
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetches content with line range support (like file_read).
   *
   * When called with the defaults (`startLine === 1`, no `endLine`) the
   * result of `fetchUrl` is returned unchanged, i.e. with `"   N | "`
   * prefixes. For any other range the selected lines are re-labelled as
   * `"N. text"` using their original 1-based line numbers.
   *
   * @param url Page to load.
   * @param startLine First line to include, 1-based; values below 1 (and
   *   NaN) are treated as 1, fractions are truncated.
   * @param endLine Last line to include, inclusive; omitted (or NaN) means
   *   "to the end". Values below 1 select nothing; values past the end are
   *   clamped.
   * @param useCache Forwarded to `fetchUrl`.
   * @returns The fetch result with `markdown` and `lineCount` narrowed to the
   *   requested range.
   */
  async fetchUrlWithRange(
    url: string,
    startLine: number = 1,
    endLine?: number,
    useCache: boolean = true,
  ): Promise<FetchResult> {
    const result = await this.fetchUrl(url, useCache);
    if (startLine === 1 && endLine === undefined) {
      return result;
    }

    const lines = result.markdown.split("\n");

    // Normalise to integer slice bounds. Clamping the end at 0 matters:
    // a negative end would otherwise be read by `slice` as "from the end".
    const startIdx = Number.isNaN(startLine)
      ? 0
      : Math.max(0, Math.trunc(startLine) - 1);
    const endIdx =
      endLine === undefined || Number.isNaN(endLine)
        ? lines.length
        : Math.min(Math.max(0, Math.trunc(endLine)), lines.length);

    const selectedLines = lines.slice(startIdx, endIdx);
    const numberedLines = selectedLines
      .map((line, i) => {
        // Strip only a genuine "N | " prefix; a line without one is kept
        // intact rather than losing its leading character(s).
        const text = line.replace(WebFetcherService.LINE_PREFIX, "");
        return `${startIdx + i + 1}. ${text}`;
      })
      .join("\n");

    return {
      ...result,
      markdown: numberedLines,
      lineCount: selectedLines.length,
    };
  }
}
// Module-level singleton: the service is constructed once when this module
// is first imported, and every importer shares that instance.
const webFetcherService = new WebFetcherService();

export default webFetcherService;