// gadget/docs/archive/services/web-fetcher.ts

// src/services/web-fetcher.ts
// Copyright (C) 2025 DTP Technologies, LLC
// All Rights Reserved

import env from "../config/env.js";

import { chromium, type Browser, type Page } from "playwright";
import TurndownService from "turndown";
import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";
import crypto from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";

import { DtpService } from "../lib/service.js";

/**
 * Result of fetching a web page and converting it to Markdown.
 */
export interface FetchResult {
  /** The URL that was requested, exactly as passed in by the caller. */
  url: string;
  /**
   * Page title: the Readability article title when available, otherwise the
   * browser's document title. Cached entries without a title report "Unknown".
   */
  title: string;
  /**
   * Markdown content with a line-number prefix on every line
   * (`   1 | text` for full fetches, `1. text` for ranged fetches).
   */
  markdown: string;
  /** Number of lines contained in `markdown`. */
  lineCount: number;
}

/**
 * Fetches web pages with a headless Chromium browser, reduces them to their
 * main content with Readability, converts that content to line-numbered
 * Markdown, and caches the result on disk under `.gadget-cache`.
 */
export class WebFetcherService extends DtpService {
  /** Matches the `<n> | ` prefix that `addLineNumbers` puts on every line. */
  private static readonly linePrefixPattern = /^\s*\d+ \| ?/;

  private readonly turndown: TurndownService;
  private readonly cacheDir: string;

  get name(): string {
    return "WebFetcherService";
  }

  get slug(): string {
    return "web-fetcher";
  }

  constructor() {
    super();
    this.turndown = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      hr: "---",
    });

    // Remove non-informational elements before conversion
    this.turndown.remove([
      "script",
      "style",
      "noscript",
      "nav",
      "footer",
      "header",
      "button",
      "input",
      "form",
    ]);

    // Initialize cache directory
    this.cacheDir = path.join(env.projectRoot, ".gadget-cache");
  }

  /** Logs startup and makes sure the cache directory exists. */
  async start(): Promise<void> {
    this.log.info("service started");
    await this.ensureCacheDir();
  }

  /** Logs shutdown; there is no long-lived state to release. */
  async stop(): Promise<void> {
    this.log.info("service stopped");
  }

  /**
   * Collapses all whitespace runs (including newlines) into single spaces so
   * that a value always occupies exactly one line of the cache file header.
   */
  private static toHeaderValue(value: string): string {
    return value.replace(/\s+/g, " ").trim();
  }

  /**
   * Generates a cache key (hex-encoded SHA-256 digest) from a URL
   */
  private getCacheKey(url: string): string {
    return crypto.createHash("sha256").update(url).digest("hex");
  }

  /**
   * Gets the cache file path for a URL
   */
  private getCacheFilePath(url: string): string {
    return path.join(this.cacheDir, `${this.getCacheKey(url)}.md`);
  }

  /**
   * Ensures the cache directory exists
   */
  private async ensureCacheDir(): Promise<void> {
    await fs.mkdir(this.cacheDir, { recursive: true });
  }

  /**
   * Reads content from cache if available.
   *
   * @returns The cached result, or `null` when there is no readable cache
   *   file or the file does not carry the expected metadata header.
   */
  private async readFromCache(url: string): Promise<FetchResult | null> {
    let content: string;
    try {
      content = await fs.readFile(this.getCacheFilePath(url), "utf-8");
    } catch {
      // A missing or unreadable cache file is simply a cache miss.
      return null;
    }

    const lines = content.split("\n");

    // The header ends with the "## Fetched:" line. Without it the file is not
    // in the format writeToCache produces, so treat it as a miss instead of
    // handing the raw file (header included) back as page content.
    const fetchedIndex = lines.findIndex((line) =>
      line.startsWith("## Fetched:"),
    );
    if (fetchedIndex === -1) {
      return null;
    }

    // Only look for the title inside the header, never in the page body.
    const titlePrefix = "## Title:";
    const titleLine = lines
      .slice(0, fetchedIndex)
      .find((line) => line.startsWith(titlePrefix));
    // `||` on purpose: an empty title falls back to "Unknown" as well.
    const title = titleLine?.slice(titlePrefix.length).trim() || "Unknown";

    // Skip the "## Fetched:" line and the blank separator line after it.
    const markdown = lines.slice(fetchedIndex + 2).join("\n");

    return {
      url,
      title,
      markdown,
      lineCount: markdown.split("\n").length,
    };
  }

  /**
   * Writes content to cache.
   *
   * The file starts with a three-line metadata header (URL, title, fetch
   * timestamp) and a blank line, followed by the markdown.
   */
  private async writeToCache(
    url: string,
    title: string,
    markdown: string,
  ): Promise<void> {
    await this.ensureCacheDir();

    const cacheFile = this.getCacheFilePath(url);
    const timestamp = new Date().toISOString();

    // Header values are flattened to one line each; an embedded newline would
    // otherwise break the line-oriented header that readFromCache parses.
    const content = [
      `## URL: ${WebFetcherService.toHeaderValue(url)}`,
      `## Title: ${WebFetcherService.toHeaderValue(title)}`,
      `## Fetched: ${timestamp}`,
      "",
      markdown,
    ].join("\n");

    await fs.writeFile(cacheFile, content, "utf-8");
  }

  /**
   * Adds line numbers to markdown text (`   1 | text`, numbers right-aligned
   * to a minimum width of four characters)
   */
  private addLineNumbers(text: string): string {
    return text
      .split("\n")
      .map((line, i) => `${(i + 1).toString().padStart(4, " ")} | ${line}`)
      .join("\n");
  }

  /**
   * Fetches a URL and returns line-numbered Markdown.
   * Uses cache if available, otherwise fetches fresh content.
   *
   * @param url - Page to fetch.
   * @param useCache - When `true` (default) the cache is consulted first and
   *   updated after a fresh fetch; when `false` the cache is neither read nor
   *   written.
   * @returns The page title and its line-numbered Markdown.
   * @throws Whatever the browser navigation, conversion, or cache write throws.
   */
  async fetchUrl(url: string, useCache: boolean = true): Promise<FetchResult> {
    // NOTE(review): the URL is handed to the browser unvalidated. If callers
    // can pass untrusted URLs, consider restricting the allowed schemes.

    // Try to read from cache first
    if (useCache) {
      const cached = await this.readFromCache(url);
      if (cached) {
        return cached;
      }
    }

    const browser: Browser = await chromium.launch({ headless: true });

    try {
      // Context and page creation live inside the try block so the browser
      // is closed even when one of them fails.
      const context = await browser.newContext();
      const page: Page = await context.newPage();

      // 1. Navigate and wait for SPA hydration
      await page.goto(url, { waitUntil: "networkidle", timeout: 30000 });

      const title = await page.title();

      // 2. Get the full HTML from the browser
      const rawHtml = await page.content();

      // 3. Use JSDOM to create a DOM instance for Readability
      const doc = new JSDOM(rawHtml, { url });

      // 4. Extract the "essential" content using Readability
      const reader = new Readability(doc.window.document);
      const article = reader.parse();

      let htmlContent: string;
      let extractedTitle: string = title;

      if (article) {
        htmlContent = article.content || "";
        extractedTitle = article.title || title;
      } else {
        // Fallback: extract from <main> or <body>
        htmlContent = await page.evaluate(() => {
          const main = document.querySelector("main") || document.body;
          return main.innerHTML;
        });
      }

      // 5. Convert to Markdown
      const markdown = this.turndown.turndown(htmlContent);

      // 6. Apply Line Numbering for Context Offsets
      const numberedMarkdown = this.addLineNumbers(markdown);
      const lineCount = numberedMarkdown.split("\n").length;

      const result: FetchResult = {
        url,
        title: extractedTitle,
        markdown: numberedMarkdown,
        lineCount,
      };

      // 7. Cache the result
      if (useCache) {
        await this.writeToCache(url, extractedTitle, numberedMarkdown);
      }

      return result;
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetches content with line range support (like file_read).
   *
   * @param url - Page to fetch.
   * @param startLine - First line to return, 1-based and inclusive. Values
   *   below 1 (and NaN) are treated as 1; fractions are rounded down.
   * @param endLine - Last line to return, 1-based and inclusive. Omitted means
   *   "through the last line". Values beyond the page are clamped; a value
   *   before `startLine` yields an empty selection.
   * @param useCache - Forwarded to {@link fetchUrl}.
   * @returns The full result when the default range is requested; otherwise
   *   the selected lines renumbered as `<n>. <text>` with `lineCount` set to
   *   the number of selected lines.
   */
  async fetchUrlWithRange(
    url: string,
    startLine: number = 1,
    endLine?: number,
    useCache: boolean = true,
  ): Promise<FetchResult> {
    const result = await this.fetchUrl(url, useCache);

    if (startLine === 1 && endLine === undefined) {
      return result;
    }

    const lines = result.markdown.split("\n");

    // Normalise to whole, 1-based line numbers so the labels below are always
    // integers.
    const firstLine = Number.isNaN(startLine)
      ? 1
      : Math.max(1, Math.floor(startLine));
    const startIdx = firstLine - 1;

    // Never let the end index drop below the start index: Array.prototype.slice
    // interprets negative indices as offsets from the end, which would turn
    // e.g. endLine = -1 into "almost the whole page".
    const endIdx =
      endLine === undefined || Number.isNaN(endLine)
        ? lines.length
        : Math.min(lines.length, Math.max(startIdx, Math.floor(endLine)));

    const selectedLines = lines.slice(startIdx, endIdx);
    const numberedLines = selectedLines
      .map((line, i) => {
        // Strip the "<n> | " prefix only when it is really there; lines
        // without it are kept intact.
        const text = line.replace(WebFetcherService.linePrefixPattern, "");
        return `${startIdx + i + 1}. ${text}`;
      })
      .join("\n");

    return {
      ...result,
      markdown: numberedLines,
      lineCount: selectedLines.length,
    };
  }
}

// Module-level instance of the service, exported as the default export.
export default new WebFetcherService();