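/*
 * This file defines three crawler classes -- PineconeCrawler, DiskCrawler and Neo4jCrawler --
 * each of which uses crawlee's PuppeteerCrawler to scrape web pages starting from the given URLs.
 * They differ in what they do with each page:
 *   - PineconeCrawler splits the page content and stores its embeddings in a Pinecone index;
 *   - DiskCrawler splits the page content and saves its embeddings locally with HNSWLib;
 *   - Neo4jCrawler adds the page's metadata (title, description, same-host links) to a Neo4j graph database.
 */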
import { GlobInput, PuppeteerCrawler } from "crawlee";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { PineconeStore } from "langchain/vectorstores/pinecone";
import { getPineconeClient } from "../utils/pinecone.utils";
import { GraphDbService } from "../graph/GraphDbService";
import { HNSWLib } from "langchain/vectorstores/hnswlib";
/**
 * Crawls pages with Puppeteer and adds their chunked content to a Pinecone-backed vector store.
 */
export class PineconeCrawler {
  /**
   * @param indexName Name of the Pinecone index the documents are added to.
   * @param urls Start URLs handed to the crawler.
   * @param globs Optional glob patterns forwarded to `enqueueLinks` when following links.
   */
  constructor(private indexName: string, private urls: string[], private globs?: string[]) {}

  /**
   * Runs the crawl. For every visited page the serialized page content is split
   * into chunks (tagged with the page URL as metadata) and added to the store,
   * then the page's links are enqueued. Logs "done" once the crawler finishes.
   */
  public async crawl() {
    const client = await getPineconeClient();
    const embedder = new OpenAIEmbeddings({ batchSize: 2000 });
    const store = new PineconeStore(embedder, { pineconeIndex: client.Index(this.indexName) });
    const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
    // Captured in a local so the handler below does not depend on `this`.
    const linkGlobs = this.globs;

    const puppeteerCrawler = new PuppeteerCrawler({
      requestHandlerTimeoutSecs: 120,
      maxConcurrency: 5,
      requestHandler: async ({ page, enqueueLinks, log }) => {
        const pageUrl = page.url();
        const html = await page.content();

        // Each chunk carries the URL of the page it came from.
        const chunks = await splitter.createDocuments([html], [{ url: pageUrl }]);
        await store.addDocuments(chunks);

        const { pathname } = new URL(pageUrl);
        log.info(`Added: ${pathname}`);

        await enqueueLinks({ globs: linkGlobs });
      },
    });

    await puppeteerCrawler.run(this.urls);
    console.log("done");
  }
}
/**
 * Crawls pages with Puppeteer, adds their chunked content to an in-memory
 * HNSWLib vector store, and saves that store to disk when the crawl finishes.
 */
export class DiskCrawler {
  /**
   * @param urls Start URLs handed to the crawler.
   * @param globs Optional glob patterns forwarded to `enqueueLinks` when following links.
   * @param directory Directory the vector store is saved to after the crawl.
   *   Defaults to the location that was previously hard-coded.
   */
  constructor(
    private urls: string[],
    private globs?: string[],
    private directory = "./src/chats/vectors",
  ) {}

  /**
   * Runs the crawl. For every visited page the serialized page content is split
   * into chunks (tagged with the page URL as metadata) and added to the store,
   * then the page's links are enqueued. After the crawler finishes, the store is
   * saved to `directory` and "done" is logged.
   */
  public async crawl() {
    const db = new HNSWLib(new OpenAIEmbeddings({ batchSize: 2000 }), { numDimensions: 1536, space: "cosine" });
    const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
    // Captured in a local so the handler below does not depend on `this`.
    const globs = this.globs;

    const crawler = new PuppeteerCrawler({
      requestHandlerTimeoutSecs: 120,
      maxConcurrency: 5,
      async requestHandler({ page, enqueueLinks, log }) {
        const url = page.url();
        const content = await page.content();
        // Each chunk carries the URL of the page it came from.
        const docs = await textSplitter.createDocuments([content], [{ url }]);
        await db.addDocuments(docs);
        log.info(`Added: ${new URL(url).pathname}`);
        await enqueueLinks({ globs });
      },
    });

    await crawler.run(this.urls);
    // NOTE(review): presumably `save` fails if no document was ever added
    // (e.g. every request failed) -- confirm against the HNSWLib store.
    await db.save(this.directory);
    console.log("done");
  }
}
/**
 * Crawls pages with Puppeteer and records each page (URL, title, description
 * and same-host links) through `GraphDbService.addNode`.
 */
export class Neo4jCrawler {
  /**
   * @param urls Start URLs handed to the crawler.
   * @param globs Optional glob patterns forwarded to `enqueueLinks`. When omitted,
   *   one "everything below this URL" glob is derived from each start URL.
   */
  constructor(private urls: string[], private globs?: GlobInput[]) {}

  /**
   * Builds the glob that matches everything underneath `startUrl`.
   *
   * Only origin and path are used: `URL.href` is normalised with a trailing
   * slash for bare origins (which produced `https://host//**`) and also carries
   * any query string or fragment (which produced `https://host/docs?x=1/**`).
   * Trailing slashes on the path are stripped before `/**` is appended.
   */
  private static toSubtreeGlob(startUrl: string): string {
    const { origin, pathname } = new URL(startUrl);
    return `${origin}${pathname.replace(/\/+$/, "")}/**`;
  }

  /**
   * Runs the crawl. For every visited page the title, meta description and the
   * links pointing at the same hostname are collected in the browser and passed
   * to `GraphDbService.addNode`, then the page's links are enqueued.
   * Logs "done" once the crawler finishes.
   *
   * @throws TypeError if `globs` was omitted and a start URL cannot be parsed.
   */
  public async crawl() {
    const globs = this.globs ?? this.urls.map(url => Neo4jCrawler.toSubtreeGlob(url));

    const crawler = new PuppeteerCrawler({
      async requestHandler({ request, page, enqueueLinks, log }) {
        const title = await page.title();
        const url = page.url();

        // Runs inside the page: the description falls back to "" when the meta tag is absent.
        const description = await page.evaluate(
          () => document.querySelector("head > meta[name='description']")?.getAttribute("content") ?? "",
        );

        // Runs inside the page: keep only anchors whose href parses and stays on this hostname.
        const pageLinks = await page.evaluate(() => {
          return [...document.querySelectorAll("a")]
            .map(link => link.href)
            .filter(href => {
              try {
                return new URL(href).hostname === location.hostname;
              } catch {
                // Unparseable href (e.g. empty string): not a same-host link.
                return false;
              }
            });
        });

        // NOTE(review): a new GraphDbService is created per page; if it is safe to
        // share one instance, hoisting it out of the handler would avoid the repetition.
        await new GraphDbService().addNode({ url, pageLinks, pageProps: { title, description } });
        log.info(`Title of ${request.loadedUrl} is added to graph`);
        await enqueueLinks({ globs });
      },
    });

    await crawler.run(this.urls);
    console.log("done");
  }
}