🦜️🔗 LangChain
Example of how to use LangChain.js with Apify to crawl web data, vectorize it, and prompt the OpenAI model.
src/main.js
import { rm } from 'node:fs/promises';

import { ApifyDatasetLoader } from '@langchain/community/document_loaders/web/apify_dataset';
import { HNSWLib } from '@langchain/community/vectorstores/hnswlib';
import { ChatPromptTemplate } from '@langchain/core/prompts';
import { OpenAI, OpenAIEmbeddings } from '@langchain/openai';
import { Actor, log } from 'apify';
import { createStuffDocumentsChain } from 'langchain/chains/combine_documents';
import { createRetrievalChain } from 'langchain/chains/retrieval';
import { Document } from 'langchain/document';

// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
import { retrieveVectorIndex, cacheVectorIndex } from './vector_index_cache.js';

await Actor.init();

// Follow these steps to run this template:
// 1. If running locally, authenticate to the Apify platform by executing `apify login` in your terminal.
//    This is necessary to run the Website Content Crawler Actor for data gathering.
// 2. Set the `OPENAI_API_KEY` environment variable with your OpenAI API key, which can be obtained from
//    https://platform.openai.com/account/api-keys. Refer to
//    https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console for guidance
//    on setting environment variables.
const { OPENAI_API_KEY, APIFY_TOKEN } = process.env;

// You can configure the input for the Actor in the Apify UI when running on the Apify platform, or by editing
// storage/key_value_stores/default/INPUT.json when running locally.
const {
    startUrls = [{ url: 'https://wikipedia.com' }],
    maxCrawlPages = 3,
    forceRecrawl = false, // Enforce a re-crawl of the website content and re-creation of the vector index.
    query = 'What is Wikipedia?',
    openAIApiKey = OPENAI_API_KEY, // This is a fallback to the OPENAI_API_KEY environment variable when the value is not present in the input.
} = await Actor.getInput() || {};

// Local directory where the vector index will be stored.
const VECTOR_INDEX_PATH = './vector_index';

const prompt = ChatPromptTemplate.fromTemplate(
    `Answer the user's question: {input} based on the following context {context}`,
);

if (!openAIApiKey) throw new Error('Please configure the OPENAI_API_KEY as an environment variable or enter it into the input!');
if (!APIFY_TOKEN) throw new Error('Please configure the APIFY_TOKEN environment variable! Call `apify login` in your terminal to authenticate.');

// Now we want to create a vector index from the crawled documents.
// The following object represents the input for the https://apify.com/apify/website-content-crawler Actor, which crawls the website to gather the data.
const websiteContentCrawlerInput = { startUrls, maxCrawlPages };

// This variable will contain a vector index that we will use to retrieve the most relevant documents for a given query.
let vectorStore;

// First, we check if the vector index is already cached. If not, we run the Website Content Crawler to get the documents.
// By setting forceRecrawl=true you can enforce a re-scrape of the website content and re-creation of the vector index.
log.info('Fetching cached vector index from key-value store...');
const reinitializeIndex = forceRecrawl || !(await retrieveVectorIndex(websiteContentCrawlerInput));
if (reinitializeIndex) {
    // Run the Actor, wait for it to finish, and fetch its results from the Apify dataset into a LangChain document loader.
    log.info('Vector index was not found.');
    log.info('Running apify/website-content-crawler to gather the data...');
    const loader = await ApifyDatasetLoader.fromActorCall(
        'apify/website-content-crawler',
        websiteContentCrawlerInput,
        {
            datasetMappingFunction: (item) => new Document({
                pageContent: (item.text || ''),
                metadata: { source: item.url },
            }),
            clientOptions: { token: APIFY_TOKEN },
        },
    );

    // Initialize the vector index from the crawled documents.
    log.info('Feeding vector index with crawling results...');
    const docs = await loader.load();
    vectorStore = await HNSWLib.fromDocuments(
        docs,
        new OpenAIEmbeddings({ openAIApiKey }),
    );

    // Save the vector index to the key-value store so that we can skip this phase in the next run.
    log.info('Saving vector index to the disk...');
    await vectorStore.save(VECTOR_INDEX_PATH);
    await cacheVectorIndex(websiteContentCrawlerInput, VECTOR_INDEX_PATH);
}

// Load the vector index from the disk if it was not already initialized above.
if (!vectorStore) {
    log.info('Initializing the vector store...');
    vectorStore = await HNSWLib.load(VECTOR_INDEX_PATH, new OpenAIEmbeddings({ openAIApiKey }));
}

// Next, create the retrieval chain and enter a query:
const llm = new OpenAI({ openAIApiKey });
const combineDocsChain = await createStuffDocumentsChain({
    llm,
    prompt,
});

const chain = await createRetrievalChain({
    combineDocsChain,
    retriever: vectorStore.asRetriever(),
    returnSourceDocuments: true,
});

log.info('Asking model a question...');
const res = await chain.invoke({ input: query });

log.info(`Question: ${query}`);
log.info(`Model response: ${res.answer}`);

// Remove the vector index directory, as we have it cached in the key-value store for the next run.
await rm(VECTOR_INDEX_PATH, { recursive: true });

await Actor.setValue('OUTPUT', res);
await Actor.exit();
LangChain.js template
LangChain is a framework for developing applications powered by language models.
This example template illustrates how to use LangChain.js with Apify to crawl web data, vectorize it, and prompt the OpenAI model. All of this happens within a single Apify Actor and slightly over a hundred lines of code.
Included features
- Apify SDK - a toolkit for building Actors
- Input schema - define and easily validate a schema for your Actor's input (a sketch follows this list)
- LangChain.js - a framework for developing applications powered by language models
- OpenAI - a powerful language model
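As an illustration of the input schema feature, a minimal .actor/input_schema.json for this template might look like the sketch below. The property names match the input destructured in src/main.js, but the titles, editors, and descriptions here are assumptions; the template ships with its own schema file.

```json
{
    "title": "LangChain.js example",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "editor": "requestListSources",
            "description": "URLs where the crawl starts."
        },
        "maxCrawlPages": {
            "title": "Max crawl pages",
            "type": "integer",
            "description": "Maximum number of pages to crawl."
        },
        "forceRecrawl": {
            "title": "Force re-crawl",
            "type": "boolean",
            "description": "Re-crawl the website and re-create the vector index even if a cached index exists."
        },
        "query": {
            "title": "Query",
            "type": "string",
            "editor": "textfield",
            "description": "The question to ask the model about the crawled content."
        },
        "openAIApiKey": {
            "title": "OpenAI API key",
            "type": "string",
            "editor": "textfield",
            "isSecret": true,
            "description": "Falls back to the OPENAI_API_KEY environment variable when omitted."
        }
    }
}
```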
How it works
The code contains the following steps:
- Crawls the given website using the Website Content Crawler Actor.
- Vectorizes the data using the OpenAI API.
- Caches the vector index in the key-value store, so that when you run the Actor for the same website again, the cached index is reused to speed things up (a sketch of one possible cache helper follows this list).
- Feeds the data to the OpenAI model using LangChain.js and asks the given query.
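The caching itself lives in src/vector_index_cache.js, whose contents are not shown above. The following is only a hedged sketch of how such helpers could be built on the Apify key-value store: the exported names retrieveVectorIndex and cacheVectorIndex come from the import in src/main.js, while the cache-key scheme and file list are assumptions about one possible implementation.

```js
import { createHash } from 'node:crypto';
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { join } from 'node:path';

import { Actor } from 'apify';

// Files that HNSWLib.save() writes to the index directory.
const INDEX_FILES = ['hnswlib.index', 'docstore.json', 'args.json'];

// Derive a deterministic cache key from the crawler input, so a different
// startUrls/maxCrawlPages combination gets its own cached index.
const cacheKey = (crawlerInput, file) => {
    const hash = createHash('sha256').update(JSON.stringify(crawlerInput)).digest('hex').slice(0, 16);
    return `vector-index-${hash}-${file.replace('.', '-')}`;
};

// Store each index file in the default key-value store.
export async function cacheVectorIndex(crawlerInput, indexPath) {
    const store = await Actor.openKeyValueStore();
    for (const file of INDEX_FILES) {
        const buffer = await readFile(join(indexPath, file));
        await store.setValue(cacheKey(crawlerInput, file), buffer, { contentType: 'application/octet-stream' });
    }
}

// Restore the index files from the key-value store; return false on a cache miss.
// The default indexPath mirrors VECTOR_INDEX_PATH in src/main.js.
export async function retrieveVectorIndex(crawlerInput, indexPath = './vector_index') {
    const store = await Actor.openKeyValueStore();
    const buffers = await Promise.all(
        INDEX_FILES.map((file) => store.getValue(cacheKey(crawlerInput, file))),
    );
    if (buffers.some((buf) => buf == null)) return false;
    await mkdir(indexPath, { recursive: true });
    await Promise.all(INDEX_FILES.map((file, i) => writeFile(join(indexPath, file), buffers[i])));
    return true;
}
```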
Before you start
To be able to run this template both locally and on the Apify platform, you need to:
- Have an Apify account and sign in to it using the `apify login` command in your terminal. Without this, you won't be able to run the Website Content Crawler Actor required to gather the data.
- Have an OpenAI account and an API key. This is needed for vectorizing the data and also for prompting the OpenAI model.
- When running locally, store the key as the OPENAI_API_KEY environment variable (https://docs.apify.com/cli/docs/vars#set-up-environment-variables-in-apify-console).
- When running on the Apify platform, you can simply paste the key into the input field in the input UI.
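When running locally, you can also configure the input in storage/key_value_stores/default/INPUT.json, as noted in the comments in src/main.js. A minimal example mirroring the template's defaults (the openAIApiKey value is a placeholder):

```json
{
    "startUrls": [{ "url": "https://wikipedia.com" }],
    "maxCrawlPages": 3,
    "forceRecrawl": false,
    "query": "What is Wikipedia?",
    "openAIApiKey": "sk-..."
}
```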
Production use
This template serves purely as an example of the whole pipeline.
For production use, we recommend that you:
- Separate crawling, data vectorization, and prompting into separate Actors. This way, you can run them independently and scale them separately.
- Replace the local vector store with Pinecone or a similar database. See the LangChain.js docs for more information; a sketch of the swap follows this list.
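For illustration, here is a hedged sketch of what swapping HNSWLib for Pinecone could look like inside src/main.js. The @langchain/pinecone and @pinecone-database/pinecone packages are the standard LangChain.js Pinecone integration, but the index name and the PINECONE_API_KEY variable are assumptions, and docs and openAIApiKey are assumed to come from the same places as in the template code above.

```js
import { Pinecone } from '@pinecone-database/pinecone';
import { PineconeStore } from '@langchain/pinecone';
import { OpenAIEmbeddings } from '@langchain/openai';

// Hypothetical index name and API key variable.
const pinecone = new Pinecone({ apiKey: process.env.PINECONE_API_KEY });
const pineconeIndex = pinecone.Index('website-content');

// Instead of HNSWLib.fromDocuments(...): upsert the crawled documents into Pinecone.
const vectorStore = await PineconeStore.fromDocuments(
    docs,
    new OpenAIEmbeddings({ openAIApiKey }),
    { pineconeIndex },
);

// On subsequent runs, connect to the existing index without re-crawling:
// const vectorStore = await PineconeStore.fromExistingIndex(
//     new OpenAIEmbeddings({ openAIApiKey }),
//     { pineconeIndex },
// );
```

With a managed database like Pinecone, the local vector_index directory and the key-value-store caching step are no longer needed, since the index persists between runs on its own.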
Resources
- Pinecone integration Actor
- How to use Pinecone with LLMs
- How to use LangChain with OpenAI, Pinecone, and Apify
- Integration with Zapier, Make, Google Drive and others
- Video guide on getting data using Apify API
- A short guide on how to create web scrapers using code templates