200.Land

crawl-query-gpt.ts

This file defines three classes, PineconeCrawler, DiskCrawler, and Neo4jCrawler, each of which drives Crawlee's PuppeteerCrawler to scrape pages starting from the given URLs. They differ in where the scraped content ends up: PineconeCrawler chunks each page and stores OpenAI embeddings in a Pinecone index, DiskCrawler saves embeddings to a local HNSWLib vector store on disk, and Neo4jCrawler records page metadata and links in a Neo4j graph database.

import { GlobInput, PuppeteerCrawler } from "crawlee";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { PineconeStore } from "langchain/vectorstores/pinecone";
import { getPineconeClient } from "../utils/pinecone.utils";
import { GraphDbService } from "../graph/GraphDbService";
import { HNSWLib } from "langchain/vectorstores/hnswlib";

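// Crawls the given URLs with Puppeteer and upserts chunked page HTML as OpenAI embeddings into the named Pinecone index.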
export class PineconeCrawler {
  constructor(private indexName: string, private urls: string[], private globs?: string[]) {}

  public async crawl() {
    const pinecone = await getPineconeClient();
    const embeddings = new OpenAIEmbeddings({ batchSize: 2000 });
    const pineconeIndex = pinecone.Index(this.indexName);
    const pineconeStore = new PineconeStore(embeddings, { pineconeIndex });
    const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
    const globs = this.globs;

    const crawler = new PuppeteerCrawler({
      requestHandlerTimeoutSecs: 120,
      maxConcurrency: 5,
      async requestHandler({ page, enqueueLinks, log }) {
        const url = page.url();
        const content = await page.content();
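        // Split the raw page HTML into overlapping chunks, tagging each chunk with its source URL.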
        const docs = await textSplitter.createDocuments([content], [{ url }]);
        await pineconeStore.addDocuments(docs);
        log.info(`Added: ${new URL(url).pathname}`);
        await enqueueLinks({ globs });
      },
    });
    await crawler.run(this.urls);
    console.log("done");
  }
}

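// Crawls the given URLs and stores chunked page HTML as OpenAI embeddings in a local HNSWLib index.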
export class DiskCrawler {
  constructor(private urls: string[], private globs?: string[]) {}

  public async crawl() {
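    // In-memory HNSW index; 1536 dimensions matches OpenAI's default embedding model (text-embedding-ada-002).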
    const db = new HNSWLib(new OpenAIEmbeddings({ batchSize: 2000 }), { numDimensions: 1536, space: "cosine" });
    const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
    const globs = this.globs;

    const crawler = new PuppeteerCrawler({
      requestHandlerTimeoutSecs: 120,
      maxConcurrency: 5,
      async requestHandler({ page, enqueueLinks, log }) {
        const url = page.url();
        const content = await page.content();
        const docs = await textSplitter.createDocuments([content], [{ url }]);
        await db.addDocuments(docs);
        log.info(`Added: ${new URL(url).pathname}`);
        await enqueueLinks({ globs });
      },
    });
    await crawler.run(this.urls);
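    // Persist the index and docstore to disk so they can be reloaded later with HNSWLib.load().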
    await db.save("./src/chats/vectors");
    console.log("done");
  }
}

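// Crawls the given URLs and adds each page (title, meta description, and same-host links) as a node in the Neo4j graph.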
export class Neo4jCrawler {
  constructor(private urls: string[], private globs?: GlobInput[]) {}

  public async crawl() {
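    // Default to crawling only pages under each start URL when no globs are supplied.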
    // Strip any trailing slash before building the glob so a bare domain does not produce a "//" pattern that never matches.
    const globs = this.globs ?? this.urls.map(url => `${new URL(url).href.replace(/\/$/, "")}/**`);
    const crawler = new PuppeteerCrawler({
      async requestHandler({ request, page, enqueueLinks, log }) {
        const title = await page.title();
        const url = page.url();
        const description = await page.evaluate(
          () => document.querySelector("head > meta[name='description']")?.getAttribute("content") ?? "",
        );
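        // Collect every link on the page that points to the same host; off-site and malformed hrefs are filtered out.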
        const pageLinks = await page.evaluate(() => {
          return [...document.querySelectorAll("a")]
            .map(link => link.href)
            .filter(href => {
              try {
                return new URL(href).hostname === location.hostname;
              } catch (e) {
                return false;
              }
            });
        });
        await new GraphDbService().addNode({ url, pageLinks, pageProps: { title, description } });
        log.info(`Title of ${request.loadedUrl} is added to graph`);
        await enqueueLinks({ globs });
      },
    });
    await crawler.run(this.urls);
    console.log("done");
  }
}
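
A minimal usage sketch for one of the crawlers; the index name and URLs below are placeholders for illustration, not values from this file:

// Hypothetical example: crawl a documentation site into a Pinecone index.
// "docs-index" and the docs.example.com URLs are assumptions, not part of the code above.
new PineconeCrawler("docs-index", ["https://docs.example.com/"], ["https://docs.example.com/**"])
  .crawl()
  .catch(console.error);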