LLM Dataset Processor
dusan.vystrcil/llm-dataset-processor
Allows you to process a whole dataset with a single LLM prompt. It's useful if you need to enrich data, summarize content, extract specific information, or manipulate data in a structured way using AI.
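For example, a minimal run input matching the input schema below might look like this (the dataset ID and API token are placeholders):

{
    "inputDatasetId": "<YOUR_DATASET_ID>",
    "llmApiToken": "<YOUR_LLM_PROVIDER_API_TOKEN>",
    "model": "gpt-4o-mini",
    "temperature": "0.1",
    "maxTokens": 150,
    "prompt": "Summarize this text: {{text}}",
    "testPrompt": true,
    "testItemsCount": 3
}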
.dockerignore
# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ]
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage
.env

# Added by Apify CLI
.venv
CHANGELOG.md
### 0.0.45 (2024-12-10)
- First version
package.json
{
    "name": "llm-dataset-processor",
    "version": "0.0.1",
    "type": "module",
    "description": "Process or enrich datasets with LLM-generated content.",
    "engines": {
        "node": ">=18.0.0"
    },
    "dependencies": {
        "@anthropic-ai/sdk": "^0.32.1",
        "@google/generative-ai": "^0.21.0",
        "apify": "^3.2.6",
        "crawlee": "^3.11.5",
        "openai": "^4.76.0"
    },
    "devDependencies": {
        "@apify/eslint-config-ts": "^0.3.0",
        "@apify/tsconfig": "^0.1.0",
        "@typescript-eslint/eslint-plugin": "^7.18.0",
        "@typescript-eslint/parser": "^7.18.0",
        "eslint": "^8.50.0",
        "tsx": "^4.6.2",
        "typescript": "^5.3.3"
    },
    "scripts": {
        "start": "npm run start:dev",
        "start:prod": "node dist/main.js",
        "start:dev": "tsx src/main.ts",
        "build": "tsc",
        "lint": "eslint ./src --ext .ts",
        "lint:fix": "eslint ./src --ext .ts --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
tsconfig.json
{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": [
        "./src/**/*"
    ]
}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project.
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent
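In development, an actor like this is typically run and deployed with the Apify CLI rather than by invoking Docker directly. A sketch, assuming the CLI is installed and you are logged in:

apify run     # run the actor locally; input is read from ./storage/key_value_stores/default/INPUT.json
apify push    # build the image on the Apify platform and deploy the actor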
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "llm-dataset-processor",
    "title": "LLM Dataset Processor",
    "description": "Process or enrich datasets with LLM-generated content.",
    "version": "0.0",
    "meta": {
        "templateId": "ts-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "LLM Dataset Processor",
    "description": "Choose a specific dataset to process, select an LLM, provide an API token, and craft your prompt template. We recommend testing your prompt first by enabling `Test Prompt Mode`.",
    "type": "object",
    "schemaVersion": 1,
    "required": [
        "inputDatasetId",
        "llmApiToken",
        "prompt",
        "model",
        "temperature",
        "maxTokens"
    ],
    "properties": {
        "inputDatasetId": {
            "type": "string",
            "title": "Input Dataset ID",
            "description": "The ID of the dataset to process.",
            "resourceType": "dataset"
        },
        "model": {
            "type": "string",
            "title": "Large Language Model",
            "description": "The LLM to use for processing. Each model has different capabilities and pricing. GPT-4o mini and Claude 3.5 Haiku are recommended for cost-effective processing, while models like Claude 3 Opus or GPT-4o offer higher quality at a higher cost.",
            "editor": "select",
            "enumTitles": ["GPT-4o mini (Recommended)", "GPT-4o", "Claude 3.5 Haiku (Recommended)", "Claude 3.5 Sonnet", "Claude 3 Opus", "Gemini 1.5 Flash", "Gemini 1.5 Flash-8B (Recommended)", "Gemini 1.5 Pro"],
            "enum": ["gpt-4o-mini", "gpt-4o", "claude-3-5-haiku-latest", "claude-3-5-sonnet-latest", "claude-3-opus-latest", "gemini-1.5-flash", "gemini-1.5-flash-8b", "gemini-1.5-pro"]
        },
        "llmApiToken": {
            "type": "string",
            "title": "LLM Provider API Token",
            "editor": "textfield",
            "description": "Your API token for the LLM provider (e.g., OpenAI).",
            "isSecret": true
        },
        "temperature": {
            "type": "string",
            "title": "Temperature",
            "editor": "textfield",
            "description": "Sampling temperature for the LLM API (controls randomness). Use a value closer to 0 for precise, deterministic results, or closer to 1 for more 'creative' output.",
            "default": "0.1"
        },
        "multipleColumns": {
            "type": "boolean",
            "title": "Multiple columns in output",
            "description": "When enabled, instructs the LLM to return responses as JSON objects, creating multiple columns in the output dataset. The columns need to be named and described in the prompt. If disabled, responses are stored in a single `llmresponse` column.",
            "default": false
        },
        "prompt": {
            "type": "string",
            "title": "Prompt",
            "editor": "textarea",
            "minLength": 1,
            "description": "The prompt template to send to the LLM API.\n\nUse `{{fieldName}}` placeholders to insert values from the input dataset (e.g., `Summarize this text: {{content.text}}`). For multiple-column output, make sure your prompt names and describes each desired column.\n\nSee README for more details.",
            "prefill": "Summarize this text: {{text}}"
        },
        "skipItemIfEmpty": {
            "type": "boolean",
            "title": "Skip item if one or more {{field}} are empty",
            "description": "When enabled, items will be skipped if any {{field}} referenced in the prompt is empty, null, undefined, or contains only whitespace. This helps prevent processing incomplete data.",
            "default": true
        },
        "maxTokens": {
            "type": "integer",
            "title": "Max Tokens",
            "editor": "number",
            "description": "Maximum number of tokens in the LLM API response.",
            "default": 150
        },
        "testPrompt": {
            "type": "boolean",
            "title": "Test Prompt Mode",
            "description": "Test mode that processes only a limited number of items (defined by `Test Items Count`). Use this to validate your prompt and configuration before running on the full dataset. We highly recommend enabling this option first, since LLM responses can be unpredictable.",
            "default": true
        },
        "testItemsCount": {
            "type": "integer",
            "title": "Test Items Count",
            "description": "Number of items to process when `Test Prompt Mode` is enabled.",
            "default": 3,
            "minimum": 1
        }
    }
}
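To sketch the multiple-columns mode concretely: the prompt names and describes the desired columns, and the model is expected to answer with a bare JSON object whose keys become those columns. The column names and values here are illustrative, not prescribed by the actor:

Prompt:
Analyze {{text}} and return: sentiment (positive/negative/neutral), language (ISO 639-1 code), summary (one sentence).

Expected LLM response for one item:
{
    "sentiment": "positive",
    "language": "en",
    "summary": "The author reviews the product favorably."
}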
src/main.ts
// main.ts

import { Actor, log } from 'apify';
import { OpenAIProvider, AnthropicProvider, GoogleProvider, getProvider } from './providers/index.js';
import { Input, OutputItem } from './types.js';

// Rate limits for OpenAI API lowest tier
const RATE_LIMIT_PER_MINUTE = 500;
const REQUEST_INTERVAL_MS = Math.ceil(60000 / RATE_LIMIT_PER_MINUTE); // Interval between requests in ms

await Actor.init();

try {
    // Get the input
    const input: Input | null = await Actor.getInput();

    if (!input) {
        throw new Error('No input provided. Please provide the necessary input parameters.');
    }

    const {
        llmApiToken,
        prompt,
        model,
        temperature,
        maxTokens,
        skipItemIfEmpty,
        multipleColumns = false,
        provider: explicitProvider,
        testPrompt = false,
        testItemsCount = 3,
    } = input;

    const inputDatasetId = input?.inputDatasetId || input?.defaultDatasetId;

    if (!inputDatasetId) {
        throw new Error('No inputDatasetId provided.');
    }

    // Log configuration details
    const configDetails = {
        datasetId: inputDatasetId,
        model,
        promptTemplate: prompt,
        multipleColumns,
    };
    log.info('Configuration details:', configDetails);

    // Helper function to get nested field value using dot notation
    function getNestedValue(obj: any, path: string): any {
        return path.split('.').reduce((current, key) => current && current[key], obj);
    }

    // Helper function to check if a value is empty
    function isEmpty(value: any): boolean {
        if (value === undefined || value === null) return true;
        if (typeof value === 'string') return value.trim() === '';
        if (Array.isArray(value)) return value.length === 0;
        if (typeof value === 'object') return Object.keys(value).length === 0;
        return false;
    }

    // Helper function to check if any placeholder field is empty
    function hasEmptyFields(promptStr: string, item: OutputItem): boolean {
        const fieldMatches = promptStr.match(/\{\{([^}]+)\}\}/g) || [];
        return fieldMatches.some((match) => {
            const field = match.slice(2, -2).trim(); // Remove {{ and }}
            const value = getNestedValue(item, field);
            return isEmpty(value);
        });
    }

    // Helper function to replace field placeholders in prompt with actual values
    function replacePlaceholders(promptStr: string, item: OutputItem): string {
        return promptStr.replace(/\{\{([^}]+)\}\}/g, (_match, fieldName: string) => {
            const value = getNestedValue(item, fieldName.trim());
            return value !== undefined ? String(value) : '';
        });
    }

    // Build the final prompt:
    // If multipleColumns is true, we instruct the LLM to return a strict JSON object.
    function buildFinalPrompt(promptText: string): string {
        if (!multipleColumns) {
            return promptText;
        }

        // Append clear instructions to return JSON only
        return `${promptText}

Important: Return only a strict JSON object with the requested fields as keys. No extra text or explanations, no markdown, just JSON.`;
    }

    // Fetch items from the input dataset
    let items: OutputItem[] = [];
    try {
        // First check if dataset exists
        const dataset = await Actor.apifyClient.dataset(inputDatasetId).get();
        if (!dataset) {
            throw new Error(`Dataset with ID ${inputDatasetId} does not exist`);
        }

        const inputDataset = await Actor.openDataset<OutputItem>(inputDatasetId);
        const { items: fetchedItems } = await inputDataset.getData();
        items = fetchedItems;

        // If test mode is enabled, limit the number of items
        if (testPrompt) {
            const itemCount = Math.min(testItemsCount, items.length);
            items = items.slice(0, itemCount);
            log.info(`Test mode enabled - processing ${itemCount} items out of ${fetchedItems.length}`);
        } else {
            log.info(`Fetched ${items.length} items from the input dataset.`);
        }
    } catch (datasetError: unknown) {
        if (datasetError instanceof Error) {
            log.error(`Error accessing dataset: ${datasetError.message}`);
        } else {
            log.error('Error accessing dataset: Unknown error occurred');
        }
        throw datasetError;
    }

    // Initialize API clients
    const providers = {
        openai: new OpenAIProvider(llmApiToken),
        anthropic: new AnthropicProvider(llmApiToken),
        google: new GoogleProvider(llmApiToken),
    };

    // Convert temperature from string to number
    const temperatureNum = parseFloat(temperature);

    // If multipleColumns is true, we can do a validation step with a sample item.
    async function validateJsonFormat(testItem: OutputItem): Promise<boolean> {
        if (!multipleColumns) return true; // No need to validate if single column.

        const provider = getProvider(model, explicitProvider);
        let finalPrompt = replacePlaceholders(buildFinalPrompt(prompt), testItem);

        for (let attempt = 1; attempt <= 3; attempt++) {
            try {
                const testResponse = await providers[provider].call(
                    finalPrompt,
                    model,
                    temperatureNum,
                    maxTokens,
                );

                // First check if we got an empty response
                if (!testResponse) {
                    log.error('Empty response received from the API');
                    throw new Error('Empty response received from the API');
                }

                // Try parsing as JSON:
                try {
                    JSON.parse(testResponse);
                    return true; // JSON parsed successfully
                } catch (jsonError) {
                    if (attempt < 3) {
                        log.warning(`JSON validation attempt ${attempt} failed. Retrying...`);
                        log.debug('Response that failed JSON parsing:', { response: testResponse });
                        finalPrompt = `${finalPrompt}\n\nThe last response was not valid JSON. Please return valid JSON this time.`;
                    } else {
                        log.error('JSON validation attempts exhausted. The prompt may not produce valid JSON.');
                        log.debug('Final response that failed JSON parsing:', { response: testResponse });
                        return false;
                    }
                }
            } catch (apiError: any) {
                // Log the full error for debugging
                log.error('API call failed:', {
                    error: apiError.message,
                    type: apiError.type,
                    code: apiError.code,
                    param: apiError.param,
                });

                // Rethrow API errors immediately instead of retrying
                throw apiError;
            }
        }
        return false;
    }

    if (items.length > 0) {
        const validationResult = await validateJsonFormat(items[0]);
        if (multipleColumns && !validationResult) {
            throw new Error('Failed to produce valid JSON after multiple attempts. Please adjust your prompt or disable multiple columns.');
        }
    }

    // Process each item
    for (let i = 0; i < items.length; i++) {
        const item = items[i];

        try {
            // Skip if any required fields are empty and skipItemIfEmpty is true
            if (skipItemIfEmpty && hasEmptyFields(prompt, item)) {
                log.info(`Skipping item ${i + 1} due to empty fields`);
                continue;
            }

            // Replace placeholders in the prompt
            const finalPrompt = replacePlaceholders(buildFinalPrompt(prompt), item);
            log.info(`Processing item ${i + 1}/${items.length}`, { prompt: finalPrompt });

            // Determine the provider and make the API call
            const provider = getProvider(model, explicitProvider); // returns 'openai' | 'anthropic' | 'google'
            const llmresponse = await providers[provider].call(
                finalPrompt,
                model,
                temperatureNum,
                maxTokens,
            );

            log.info(`Item ${i + 1} response:`, { response: llmresponse });

            if (multipleColumns) {
                let parsedData: any;
                let attemptsLeft = 2; // After initial call, try parsing or retrying up to 2 times more if needed
                let currentResponse = llmresponse;
                let success = false;

                while (attemptsLeft >= 0) {
                    try {
                        parsedData = JSON.parse(currentResponse);
                        success = true;
                        break;
                    } catch (err) {
                        if (attemptsLeft > 0) {
                            // Retry by asking again for correct JSON
                            log.warning(`Failed to parse JSON for item ${i + 1}. Retrying...`);
                            const retryPrompt = `${finalPrompt}\n\nThe last response was not valid JSON. Please return valid JSON this time.`;
                            const retryResponse = await providers[provider].call(
                                retryPrompt,
                                model,
                                temperatureNum,
                                maxTokens,
                            );
                            currentResponse = retryResponse;
                            attemptsLeft--;
                        } else {
                            // No attempts left
                            log.error(`Failed to parse JSON after multiple attempts for item ${i + 1}. Using raw response as single column.`);
                            break;
                        }
                    }
                }

                if (success && typeof parsedData === 'object' && parsedData !== null) {
                    // Push multiple columns
                    const outputItem: Record<string, unknown> = { ...item };
                    for (const key of Object.keys(parsedData)) {
                        outputItem[key] = parsedData[key];
                    }
                    await Actor.pushData(outputItem);
                } else {
                    // Fallback to single column mode if JSON parsing failed
                    const fallbackItem = { ...item, llmresponse: currentResponse };
                    await Actor.pushData(fallbackItem);
                }
            } else {
                // Single column mode: Just push the response
                item.llmresponse = llmresponse;
                await Actor.pushData(item);
            }

            // Respect rate limits
            await new Promise((resolve) => setTimeout(resolve, REQUEST_INTERVAL_MS));
        } catch (error: unknown) {
            if (error instanceof Error) {
                log.error(`Error processing item ${i + 1}: ${error.message}`);
            } else {
                log.error(`Error processing item ${i + 1}: Unknown error occurred`);
            }

            // Decide whether to continue or break. For now, we rethrow to fail the actor on error.
            throw error;
        }
    }

    log.info('Actor finished successfully');
} catch (error: unknown) {
    if (error instanceof Error) {
        log.error(`Actor failed: ${error.message}`);
    } else {
        log.error('Actor failed: Unknown error occurred');
    }
    throw error;
} finally {
    await Actor.exit();
}
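To make the `{{field}}` resolution above concrete, here is a small standalone sketch mirroring the logic of `getNestedValue` and `replacePlaceholders` (the sample item is made up):

// Sample dataset item (illustrative).
const item = { title: 'Q3 report', content: { text: 'Revenue grew 12%.' } };

// Resolve dot-notation placeholders the same way main.ts does.
const template = 'Summarize this text: {{content.text}}';
const rendered = template.replace(/\{\{([^}]+)\}\}/g, (_m, field: string) => {
    const value = field.trim().split('.').reduce((cur: any, key) => cur && cur[key], item);
    return value !== undefined ? String(value) : '';
});

console.log(rendered); // "Summarize this text: Revenue grew 12%."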
src/types.ts
export interface Input {
    inputDatasetId: string;
    defaultDatasetId: string;
    llmApiToken: string;
    prompt: string;
    model: string;
    temperature: string;
    maxTokens: number;
    skipItemIfEmpty?: boolean;
    multipleColumns?: boolean;
    provider?: 'openai' | 'anthropic' | 'google';
    testPrompt?: boolean;
    testItemsCount?: number;
}

export interface OutputItem extends Record<string, any> {
    // Filled in by main.ts in single-column mode (input items arrive without it).
    llmresponse?: string | null;
}

export interface LLMProvider {
    call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string>;
}
src/providers/anthropic.ts
import Anthropic from '@anthropic-ai/sdk';
import { LLMProvider } from '../types.js';

export class AnthropicProvider implements LLMProvider {
    private client: Anthropic;

    constructor(apiKey: string) {
        this.client = new Anthropic({ apiKey });
    }

    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const message = await this.client.messages.create({
            model,
            max_tokens: maxTokens,
            temperature,
            messages: [{ role: 'user', content: promptText }],
        });

        if (!message.content || message.content.length === 0) {
            return '';
        }

        const textContent = message.content.find((c) => c.type === 'text');
        return textContent?.text || '';
    }
}
src/providers/google.ts
import { GoogleGenerativeAI } from '@google/generative-ai';
import { LLMProvider } from '../types.js';

export class GoogleProvider implements LLMProvider {
    private client: GoogleGenerativeAI;

    constructor(apiKey: string) {
        this.client = new GoogleGenerativeAI(apiKey);
    }

    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const genModel = this.client.getGenerativeModel({ model });

        const result = await genModel.generateContent({
            contents: [{ role: 'user', parts: [{ text: promptText }] }],
            generationConfig: {
                temperature,
                maxOutputTokens: maxTokens,
            },
        });

        const response = result.response;
        return response.text();
    }
}
src/providers/index.ts
export * from './openai.js';
export * from './anthropic.js';
export * from './google.js';

export const getProvider = (model: string, explicitProvider?: string): 'openai' | 'anthropic' | 'google' => {
    if (explicitProvider) {
        return explicitProvider as 'openai' | 'anthropic' | 'google';
    }
    if (model.startsWith('claude')) return 'anthropic';
    if (model.startsWith('gemini')) return 'google';
    return 'openai';
};
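Routing resolves as follows; an explicit provider, when given, always wins over the model-name prefix:

import { getProvider } from './providers/index.js'; // as imported from src/main.ts

getProvider('claude-3-5-haiku-latest'); // 'anthropic' (model prefix match)
getProvider('gemini-1.5-flash-8b');     // 'google'
getProvider('gpt-4o-mini');             // 'openai' (default)
getProvider('gpt-4o-mini', 'google');   // 'google' (explicit provider wins)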
src/providers/openai.ts
import OpenAI from 'openai';
import { LLMProvider } from '../types.js';

export class OpenAIProvider implements LLMProvider {
    private client: OpenAI;

    constructor(apiKey: string) {
        this.client = new OpenAI({ apiKey });
    }

    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const completion = await this.client.chat.completions.create({
            messages: [{ role: 'user', content: promptText }],
            model,
            temperature,
            max_tokens: maxTokens,
        });
        return completion.choices[0]?.message?.content || '';
    }
}