277 lines
6.9 KiB
TypeScript
277 lines
6.9 KiB
TypeScript
// src/services/web-fetcher.ts
|
|
// Copyright (C) 2025 DTP Technologies, LLC
|
|
// All Rights Reserved
|
|
|
|
import env from "../config/env.js";
|
|
|
|
import { chromium, type Browser, type Page } from "playwright";
|
|
import TurndownService from "turndown";
|
|
import { JSDOM } from "jsdom";
|
|
import { Readability } from "@mozilla/readability";
|
|
import crypto from "node:crypto";
|
|
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
|
|
import { DtpService } from "../lib/service.js";
|
|
|
|
/**
 * Result of fetching a URL and converting its content to Markdown.
 */
export interface FetchResult {
  /** The URL that was requested. */
  url: string;

  /** Title associated with the fetched content. */
  title: string;

  /** Markdown text; each line carries a line-number prefix. */
  markdown: string;

  /** Number of lines contained in `markdown`. */
  lineCount: number;
}
|
|
|
|
/**
 * Renders a web page in headless Chromium, extracts its main content with
 * Readability, converts it to Markdown with Turndown, and prefixes every line
 * with its line number. Results are cached on disk, one file per URL, inside
 * `<projectRoot>/.gadget-cache`.
 */
export class WebFetcherService extends DtpService {
  /**
   * Matches the `"   N | "` prefix written by `addLineNumbers`. Anchored at the
   * start of the line so a `|` inside the page content is never mistaken for
   * the separator.
   */
  private static readonly LINE_NUMBER_PREFIX = /^\s*\d+ \| ?/;

  private readonly turndown: TurndownService;
  private readonly cacheDir: string;

  get name(): string {
    return "WebFetcherService";
  }

  get slug(): string {
    return "web-fetcher";
  }

  constructor() {
    super();

    this.turndown = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      hr: "---",
    });

    // Remove non-informational elements before conversion
    this.turndown.remove([
      "script",
      "style",
      "noscript",
      "nav",
      "footer",
      "header",
      "button",
      "input",
      "form",
    ]);

    // Cache files live under the project root.
    this.cacheDir = path.join(env.projectRoot, ".gadget-cache");
  }

  /** Logs startup and makes sure the cache directory exists. */
  async start(): Promise<void> {
    this.log.info("service started");
    await this.ensureCacheDir();
  }

  /** Logs shutdown; there is no other state to release. */
  async stop(): Promise<void> {
    this.log.info("service stopped");
  }

  /**
   * Generates a cache key from a URL.
   *
   * @param url - URL to derive the key from.
   * @returns Hex-encoded SHA-256 digest of `url`.
   */
  private getCacheKey(url: string): string {
    return crypto.createHash("sha256").update(url).digest("hex");
  }

  /**
   * Gets the cache file path for a URL.
   *
   * @param url - URL whose cache entry is wanted.
   * @returns Path of the form `<cacheDir>/<sha256>.md`.
   */
  private getCacheFilePath(url: string): string {
    return path.join(this.cacheDir, `${this.getCacheKey(url)}.md`);
  }

  /** Ensures the cache directory exists (creating parent directories too). */
  private async ensureCacheDir(): Promise<void> {
    await fs.mkdir(this.cacheDir, { recursive: true });
  }

  /**
   * Reads content from cache if available.
   *
   * A cache file starts with three metadata lines (`## URL:`, `## Title:`,
   * `## Fetched:`), then one blank line, then the line-numbered Markdown.
   *
   * @param url - URL whose cache entry should be read.
   * @returns The cached result, or `null` when the entry is missing,
   *   unreadable, or has no `## Fetched:` metadata line.
   */
  private async readFromCache(url: string): Promise<FetchResult | null> {
    let content: string;
    try {
      content = await fs.readFile(this.getCacheFilePath(url), "utf-8");
    } catch {
      // A missing or unreadable file is simply a cache miss.
      return null;
    }

    const lines = content.split("\n");
    const fetchedIndex = lines.findIndex((line) =>
      line.startsWith("## Fetched:"),
    );
    if (fetchedIndex === -1) {
      // Without the final metadata line we cannot tell where the body starts;
      // returning the whole file would leak the header into the Markdown.
      return null;
    }

    // +2 skips the "## Fetched:" line itself and the blank line after it.
    const markdown = lines.slice(fetchedIndex + 2).join("\n");

    // `||` on purpose: an empty title should also fall back to "Unknown".
    const titleLine = lines
      .slice(0, fetchedIndex)
      .find((line) => line.startsWith("## Title:"));
    const title = titleLine?.replace("## Title:", "").trim() || "Unknown";

    return {
      url,
      title,
      markdown,
      lineCount: markdown.split("\n").length,
    };
  }

  /**
   * Writes content to cache.
   *
   * @param url - URL the content was fetched from (also determines the file name).
   * @param title - Page title; whitespace runs (including newlines) are
   *   collapsed to single spaces so the title stays on one metadata line.
   * @param markdown - Markdown body to store after the metadata header.
   */
  private async writeToCache(
    url: string,
    title: string,
    markdown: string,
  ): Promise<void> {
    await this.ensureCacheDir();

    const cacheFile = this.getCacheFilePath(url);
    const timestamp = new Date().toISOString();

    // The reader parses the header line by line, so the title must not
    // contain line breaks.
    const singleLineTitle = title.replace(/\s+/g, " ").trim();

    const content = [
      `## URL: ${url}`,
      `## Title: ${singleLineTitle}`,
      `## Fetched: ${timestamp}`,
      "",
      markdown,
    ].join("\n");

    await fs.writeFile(cacheFile, content, "utf-8");
  }

  /**
   * Adds line numbers to markdown text.
   *
   * @param text - Text to number.
   * @returns `text` with every line prefixed by its 1-based number, left-padded
   *   to four characters, followed by `" | "`.
   */
  private addLineNumbers(text: string): string {
    return text
      .split("\n")
      .map((line, i) => `${(i + 1).toString().padStart(4, " ")} | ${line}`)
      .join("\n");
  }

  /**
   * Fetches a URL and returns line-numbered Markdown.
   * Uses cache if available, otherwise fetches fresh content.
   *
   * @param url - Page to fetch.
   * @param useCache - When `true` (default), a cached entry is returned if one
   *   exists and a fresh result is written to the cache; when `false`, the
   *   cache is neither read nor written.
   * @returns The page title, numbered Markdown and its line count.
   * @throws Propagates errors from the browser (e.g. navigation timeout after
   *   30 s) and from writing the cache file.
   */
  async fetchUrl(url: string, useCache: boolean = true): Promise<FetchResult> {
    // Try to read from cache first
    if (useCache) {
      const cached = await this.readFromCache(url);
      if (cached) {
        return cached;
      }
    }

    const browser: Browser = await chromium.launch({ headless: true });

    try {
      // Context and page are created inside the try so the browser is closed
      // even when one of these calls fails.
      const context = await browser.newContext();
      const page: Page = await context.newPage();

      // 1. Navigate and wait until the network is idle
      await page.goto(url, { waitUntil: "networkidle", timeout: 30000 });

      const title = await page.title();

      // 2. Get the full HTML from the browser
      const rawHtml = await page.content();

      // 3. Use JSDOM to create a DOM instance for Readability
      const doc = new JSDOM(rawHtml, { url });

      // 4. Extract the "essential" content using Readability
      const article = new Readability(doc.window.document).parse();

      let htmlContent: string;
      let extractedTitle: string = title;

      if (article) {
        htmlContent = article.content || "";
        // `||` on purpose: an empty article title falls back to the page title.
        extractedTitle = article.title || title;
      } else {
        // Fallback: extract from <main> or <body>
        htmlContent = await page.evaluate(() => {
          const main = document.querySelector("main") || document.body;
          return main.innerHTML;
        });
      }

      // 5. Convert to Markdown
      const markdown = this.turndown.turndown(htmlContent);

      // 6. Apply Line Numbering for Context Offsets
      const numberedMarkdown = this.addLineNumbers(markdown);
      const lineCount = numberedMarkdown.split("\n").length;

      const result: FetchResult = {
        url,
        title: extractedTitle,
        markdown: numberedMarkdown,
        lineCount,
      };

      // 7. Cache the result
      if (useCache) {
        await this.writeToCache(url, extractedTitle, numberedMarkdown);
      }

      return result;
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetches content with line range support (like file_read).
   *
   * When the whole document is requested (`startLine === 1` and no `endLine`)
   * the result of `fetchUrl` is returned unchanged, i.e. lines look like
   * `"   N | text"`. For any other range the selected lines are re-labelled as
   * `"N. text"`, where `N` is the line's number in the full document.
   *
   * @param url - Page to fetch.
   * @param startLine - First line to include, 1-based (values below 1 are
   *   treated as 1).
   * @param endLine - Last line to include, inclusive; defaults to the last
   *   line and is clamped to the document length.
   * @param useCache - Forwarded to `fetchUrl`.
   * @returns The fetch result restricted to the requested lines; `lineCount`
   *   is the number of lines selected.
   */
  async fetchUrlWithRange(
    url: string,
    startLine: number = 1,
    endLine?: number,
    useCache: boolean = true,
  ): Promise<FetchResult> {
    const result = await this.fetchUrl(url, useCache);

    if (startLine === 1 && endLine === undefined) {
      return result;
    }

    const lines = result.markdown.split("\n");
    const startIdx = Math.max(0, startLine - 1);
    const endIdx =
      endLine !== undefined ? Math.min(endLine, lines.length) : lines.length;

    const selectedLines = lines.slice(startIdx, endIdx);
    const numberedLines = selectedLines
      .map((line, i) => {
        // Strip only a genuine "N | " prefix; a line without one is kept
        // intact instead of losing its first character.
        const text = line.replace(WebFetcherService.LINE_NUMBER_PREFIX, "");
        return `${startIdx + i + 1}. ${text}`;
      })
      .join("\n");

    return {
      ...result,
      markdown: numberedLines,
      lineCount: selectedLines.length,
    };
  }
}
|
|
|
|
// Single shared instance, created when this module is first imported.
const webFetcherService = new WebFetcherService();

export default webFetcherService;
|