// src/services/web-fetcher.ts
// Copyright (C) 2025 DTP Technologies, LLC
// All Rights Reserved

import env from "../config/env.js";

import { chromium, type Browser, type Page } from "playwright";
import TurndownService from "turndown";
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import crypto from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";

import { DtpService } from "../lib/service.js";

/**
 * Result of fetching a page and converting it to Markdown.
 */
export interface FetchResult {
  /** The URL that was requested. */
  url: string;
  /** Page title (Readability's title when available, else the browser title). */
  title: string;
  /** Markdown body with a line-number prefix on every line. */
  markdown: string;
  /** Number of lines in `markdown`. */
  lineCount: number;
}

/**
 * Fetches web pages with a headless Chromium browser, extracts the main
 * article content with Readability, converts it to line-numbered Markdown,
 * and caches the result on disk under `<projectRoot>/.gadget-cache`.
 */
export class WebFetcherService extends DtpService {
  private readonly turndown: TurndownService;
  private readonly cacheDir: string;

  get name(): string {
    return "WebFetcherService";
  }

  get slug(): string {
    return "web-fetcher";
  }

  constructor() {
    super();

    this.turndown = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      hr: "---",
    });

    // Remove non-informational elements before conversion
    this.turndown.remove([
      "script",
      "style",
      "noscript",
      "nav",
      "footer",
      "header",
      "button",
      "input",
      "form",
    ]);

    // Initialize cache directory
    this.cacheDir = path.join(env.projectRoot, ".gadget-cache");
  }

  /**
   * Logs startup and makes sure the cache directory exists.
   */
  async start(): Promise<void> {
    this.log.info("service started");
    await this.ensureCacheDir();
  }

  /**
   * Logs shutdown. No other resources are held between fetches.
   */
  async stop(): Promise<void> {
    this.log.info("service stopped");
  }

  /**
   * Generates a cache key from a URL.
   *
   * @param url - URL to derive the key from.
   * @returns Hex-encoded SHA-256 digest of `url`.
   */
  private getCacheKey(url: string): string {
    return crypto.createHash("sha256").update(url).digest("hex");
  }

  /**
   * Gets the cache file path for a URL.
   *
   * @param url - URL whose cache entry is wanted.
   * @returns `<cacheDir>/<cacheKey>.md`.
   */
  private getCacheFilePath(url: string): string {
    return path.join(this.cacheDir, `${this.getCacheKey(url)}.md`);
  }

  /**
   * Ensures the cache directory exists (creating parent directories as needed).
   */
  private async ensureCacheDir(): Promise<void> {
    await fs.mkdir(this.cacheDir, { recursive: true });
  }

  /**
   * Reads content from cache if available.
   *
   * The cache file layout is the one produced by `writeToCache`: three
   * `## ...` metadata lines, a blank line, then the Markdown body.
   *
   * @param url - URL whose cache entry should be read.
   * @returns The cached result, or `null` when the file cannot be read or
   *   does not contain the expected `## Fetched:` metadata line.
   */
  private async readFromCache(url: string): Promise<FetchResult | null> {
    let content: string;
    try {
      content = await fs.readFile(this.getCacheFilePath(url), "utf-8");
    } catch {
      // Best-effort cache: any read failure is treated as a cache miss.
      return null;
    }

    const lines = content.split("\n");

    // "## Fetched:" is the last metadata line. Without it the file is not in
    // the expected format, so treat it as a miss rather than returning the
    // metadata lines as if they were page content.
    const fetchedIndex = lines.findIndex((line) =>
      line.startsWith("## Fetched:"),
    );
    if (fetchedIndex === -1) {
      return null;
    }

    // Skip the "## Fetched:" line and the blank line after the metadata.
    const markdown = lines.slice(fetchedIndex + 2).join("\n");
    const lineCount = markdown.split("\n").length;

    // Extract title from the metadata header only, never from the body.
    const titlePrefix = "## Title:";
    const titleLine = lines
      .slice(0, fetchedIndex)
      .find((line) => line.startsWith(titlePrefix));
    const title = titleLine?.slice(titlePrefix.length).trim() || "Unknown";

    return { url, title, markdown, lineCount };
  }

  /**
   * Writes content to cache.
   *
   * @param url - URL the content was fetched from.
   * @param title - Page title to store in the metadata header.
   * @param markdown - Markdown body to store after the header.
   */
  private async writeToCache(
    url: string,
    title: string,
    markdown: string,
  ): Promise<void> {
    await this.ensureCacheDir();

    const cacheFile = this.getCacheFilePath(url);
    const timestamp = new Date().toISOString();

    // The header is parsed line by line, so a title containing line breaks
    // must be flattened to keep each metadata field on exactly one line.
    const singleLineTitle = title.replace(/[\r\n]+/g, " ");

    const content = `## URL: ${url}
## Title: ${singleLineTitle}
## Fetched: ${timestamp}

${markdown}`;

    await fs.writeFile(cacheFile, content, "utf-8");
  }

  /**
   * Adds line numbers to markdown text.
   *
   * @param text - Text to number.
   * @returns `text` with every line prefixed by its 1-based number,
   *   left-padded to at least four characters, followed by ` | `.
   */
  private addLineNumbers(text: string): string {
    return text
      .split("\n")
      .map((line, i) => `${(i + 1).toString().padStart(4, " ")} | ${line}`)
      .join("\n");
  }

  /**
   * Removes the prefix added by `addLineNumbers` from a single line.
   * Lines that do not carry the prefix are returned unchanged.
   */
  private stripLineNumber(line: string): string {
    return line.replace(/^\s*\d+ \| ?/, "");
  }

  /**
   * Fetches a URL and returns line-numbered Markdown.
   * Uses cache if available, otherwise fetches fresh content.
   *
   * @param url - Page to fetch.
   * @param useCache - When `true` (default), a cached copy is returned if
   *   present and a freshly fetched page is written to the cache.
   * @returns The page title and line-numbered Markdown.
   */
  async fetchUrl(url: string, useCache: boolean = true): Promise<FetchResult> {
    // Try to read from cache first
    if (useCache) {
      const cached = await this.readFromCache(url);
      if (cached) {
        return cached;
      }
    }

    const browser: Browser = await chromium.launch({ headless: true });

    try {
      // Created inside the try so the browser is closed even when context or
      // page creation fails.
      const context = await browser.newContext();
      const page: Page = await context.newPage();

      // 1. Navigate and wait for SPA hydration
      await page.goto(url, { waitUntil: "networkidle", timeout: 30000 });

      const title = await page.title();

      // 2. Get the full HTML from the browser
      const rawHtml = await page.content();

      // 3. Use JSDOM to create a DOM instance for Readability
      const doc = new JSDOM(rawHtml, { url });

      // 4. Extract the "essential" content using Readability
      const reader = new Readability(doc.window.document);
      const article = reader.parse();

      let htmlContent: string;
      let extractedTitle: string = title;

      if (article) {
        htmlContent = article.content || "";
        extractedTitle = article.title || title;
      } else {
        // Fallback: extract from <main> or <body>
        htmlContent = await page.evaluate(() => {
          const main = document.querySelector("main") || document.body;
          return main.innerHTML;
        });
      }

      // 5. Convert to Markdown
      const markdown = this.turndown.turndown(htmlContent);

      // 6. Apply Line Numbering for Context Offsets
      const numberedMarkdown = this.addLineNumbers(markdown);
      const lineCount = numberedMarkdown.split("\n").length;

      const result: FetchResult = {
        url,
        title: extractedTitle,
        markdown: numberedMarkdown,
        lineCount,
      };

      // 7. Cache the result
      if (useCache) {
        await this.writeToCache(url, extractedTitle, numberedMarkdown);
      }

      return result;
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetches content with line range support (like file_read).
   *
   * When called with the defaults (`startLine === 1`, no `endLine`) the full
   * result of `fetchUrl` is returned unchanged. Otherwise the selected lines
   * are re-labelled as `<lineNumber>. <text>`.
   *
   * @param url - Page to fetch.
   * @param startLine - 1-based first line to include; values below 1 are
   *   treated as 1.
   * @param endLine - 1-based last line to include (inclusive); defaults to
   *   the last line. Values before `startLine` yield an empty selection.
   * @param useCache - Passed through to `fetchUrl`.
   * @returns The selected lines, with `lineCount` set to how many were
   *   selected.
   */
  async fetchUrlWithRange(
    url: string,
    startLine: number = 1,
    endLine?: number,
    useCache: boolean = true,
  ): Promise<FetchResult> {
    const result = await this.fetchUrl(url, useCache);

    if (startLine === 1 && endLine === undefined) {
      return result;
    }

    const lines = result.markdown.split("\n");
    const startIdx = Math.max(0, startLine - 1);

    // Clamp to [startIdx, lines.length]. Without the lower bound a negative
    // endLine would be interpreted by slice() as an offset from the end.
    const endIdx =
      endLine !== undefined
        ? Math.min(Math.max(endLine, startIdx), lines.length)
        : lines.length;

    const selectedLines = lines.slice(startIdx, endIdx);
    const numberedLines = selectedLines
      .map((line, i) => `${startIdx + i + 1}. ${this.stripLineNumber(line)}`)
      .join("\n");

    return {
      ...result,
      markdown: numberedLines,
      lineCount: selectedLines.length,
    };
  }
}

export default new WebFetcherService();