LLM Dataset Processor

dusan.vystrcil/llm-dataset-processor

This Actor is currently under maintenance and may be unreliable.
Allows you to process a whole dataset with a single LLM prompt. It's useful if you need to enrich data, summarize content, extract specific information, or manipulate data in a structured way using AI.
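
For example, with the prefilled prompt `Summarize this text: {{text}}`, the `{{text}}` placeholder is filled from each dataset item before the call, and in single-column mode the model's reply is stored next to the original fields in an `llmresponse` column (illustrative values, not real output):

// One input dataset item
{ "title": "Example article", "text": "Web scraping is..." }

// Prompt sent to the LLM
Summarize this text: Web scraping is...

// Output dataset item
{ "title": "Example article", "text": "Web scraping is...", "llmresponse": "A short summary..." }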

.dockerignore

# configurations
.idea
.vscode

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "root": true,
    "env": {
        "browser": true,
        "es2020": true,
        "node": true
    },
    "extends": [
        "@apify/eslint-config-ts"
    ],
    "parserOptions": {
        "project": "./tsconfig.json",
        "ecmaVersion": 2020
    },
    "ignorePatterns": [
        "node_modules",
        "dist",
        "**/*.d.ts"
    ]
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
.vscode
dist
node_modules
apify_storage
storage
.env

# Added by Apify CLI
.venv

CHANGELOG.md

### 0.0.45 (2024-12-10)
- First version

package.json

{
	"name": "llm-dataset-processor",
	"version": "0.0.1",
	"type": "module",
	"description": "Process whole datasets with a single LLM prompt.",
	"engines": {
		"node": ">=18.0.0"
	},
	"dependencies": {
		"@anthropic-ai/sdk": "^0.32.1",
		"@google/generative-ai": "^0.21.0",
		"apify": "^3.2.6",
		"crawlee": "^3.11.5",
		"openai": "^4.76.0"
	},
	"devDependencies": {
		"@apify/eslint-config-ts": "^0.3.0",
		"@apify/tsconfig": "^0.1.0",
		"@typescript-eslint/eslint-plugin": "^7.18.0",
		"@typescript-eslint/parser": "^7.18.0",
		"eslint": "^8.50.0",
		"tsx": "^4.6.2",
		"typescript": "^5.3.3"
	},
	"scripts": {
		"start": "npm run start:dev",
		"start:prod": "node dist/main.js",
		"start:dev": "tsx src/main.ts",
		"build": "tsc",
		"lint": "eslint ./src --ext .ts",
		"lint:fix": "eslint ./src --ext .ts --fix",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["DOM"]
    },
    "include": [
        "./src/**/*"
    ]
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project.
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start:prod --silent

.actor/actor.json

{
	"actorSpecification": 1,
	"name": "llm-dataset-processor",
	"title": "LLM Dataset Processor",
	"description": "Process or enrich datasets with LLM-generated content.",
	"version": "0.0",
	"meta": {
		"templateId": "ts-crawlee-cheerio"
	},
	"input": "./input_schema.json",
	"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "LLM Dataset Processor",
    "description": "Choose a specific dataset to process, select an LLM, provide an API token, and craft your prompt template. We recommend testing your prompt first by enabling `Test Prompt Mode`.",
    "type": "object",
    "schemaVersion": 1,
    "required": [
        "inputDatasetId",
        "llmApiToken",
        "prompt",
        "model",
        "temperature",
        "maxTokens"
    ],
    "properties": {
        "inputDatasetId": {
            "type": "string",
            "title": "Input Dataset ID",
            "description": "The ID of the dataset to process.",
            "resourceType": "dataset"
        },
        "model": {
            "type": "string",
            "title": "Large Language Model",
            "description": "The LLM to use for processing. Each model has different capabilities and pricing. GPT-4o mini and Claude 3.5 Haiku are recommended for cost-effective processing, while models like Claude 3 Opus or GPT-4o offer higher quality at a higher cost.",
            "editor": "select",
            "enumTitles": ["GPT-4o mini (Recommended)", "GPT-4o", "Claude 3.5 Haiku (Recommended)", "Claude 3.5 Sonnet", "Claude 3 Opus", "Gemini 1.5 Flash", "Gemini 1.5 Flash-8B (Recommended)", "Gemini 1.5 Pro"],
            "enum": ["gpt-4o-mini", "gpt-4o", "claude-3-5-haiku-latest", "claude-3-5-sonnet-latest", "claude-3-opus-latest", "gemini-1.5-flash", "gemini-1.5-flash-8b", "gemini-1.5-pro"]
        },
        "llmApiToken": {
            "type": "string",
            "title": "LLM Provider API Token",
            "editor": "textfield",
            "description": "Your API token for the LLM provider (e.g., OpenAI).",
            "isSecret": true
        },
        "temperature": {
            "type": "string",
            "title": "Temperature",
            "editor": "textfield",
            "description": "Sampling temperature for the LLM API (controls randomness). We recommend a value closer to 0 for exact results and a value closer to 1 for more 'creative' results.",
            "default": "0.1"
        },
        "multipleColumns": {
            "type": "boolean",
            "title": "Multiple columns in output",
            "description": "When enabled, instructs the LLM to return responses as JSON objects, creating multiple columns in the output dataset. The columns need to be named and described in the prompt. If disabled, responses are stored in a single `llmresponse` column.",
            "default": false
        },
        "prompt": {
            "type": "string",
            "title": "Prompt",
            "editor": "textarea",
            "minLength": 1,
            "description": "The prompt template to send to the LLM API.\n\nUse `{{fieldName}}` placeholders to insert values from the input dataset (e.g., `Summarize this text: {{content.text}}`). For multiple-columns output, ensure your prompt contains the names and descriptions of the desired output columns.\n\nSee README for more details.",
            "prefill": "Summarize this text: {{text}}"
        },
        "skipItemIfEmpty": {
            "type": "boolean",
            "title": "Skip item if one or more {{field}} are empty",
            "description": "When enabled, items will be skipped if any {{field}} referenced in the prompt is empty, null, undefined, or contains only whitespace. This helps prevent processing incomplete data.",
            "default": true
        },
        "maxTokens": {
            "type": "integer",
            "title": "Max Tokens",
            "editor": "number",
            "description": "Maximum number of tokens in the LLM API response.",
            "default": 150
        },
        "testPrompt": {
            "type": "boolean",
            "title": "Test Prompt Mode",
            "description": "Test mode that processes only a limited number of items (defined by `testItemsCount`). Use this to validate your prompt and configuration before running on the full dataset. We highly recommend enabling this option first, since LLM responses can be unpredictable.",
            "default": true
        },
        "testItemsCount": {
            "type": "integer",
            "title": "Test Items Count",
            "description": "Number of items to process when `Test Prompt Mode` is enabled.",
            "default": 3,
            "minimum": 1
        }
    }
}
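
To illustrate the multiple-columns mode described above: the prompt itself must name and describe each desired output column, and the Actor appends a strict-JSON instruction (see buildFinalPrompt in src/main.ts) so that the keys of the model's JSON answer become dataset columns. A hypothetical prompt and response:

Analyze this product review: {{text}}

Return these fields:
- sentiment: "positive", "neutral", or "negative"
- summary: a one-sentence summary of the review

// Expected model response (strict JSON; keys become output columns)
{"sentiment": "positive", "summary": "The reviewer praises the battery life."}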

src/main.ts

// main.ts

import { Actor, log } from 'apify';
import { OpenAIProvider, AnthropicProvider, GoogleProvider, getProvider } from './providers/index.js';
import { Input, OutputItem } from './types.js';

// Rate limits for OpenAI API lowest tier
const RATE_LIMIT_PER_MINUTE = 500;
const REQUEST_INTERVAL_MS = Math.ceil(60000 / RATE_LIMIT_PER_MINUTE); // Interval between requests in ms
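// With 500 requests per minute this works out to one request every 120 ms
// (Math.ceil(60000 / 500) = 120).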

await Actor.init();

try {
    // Get the input
    const input: Input | null = await Actor.getInput();

    if (!input) {
        throw new Error('No input provided. Please provide the necessary input parameters.');
    }

    const {
        llmApiToken,
        prompt,
        model,
        temperature,
        maxTokens,
        skipItemIfEmpty,
        multipleColumns = false,
        provider: explicitProvider,
        testPrompt = false,
        testItemsCount = 3,
    } = input;

    const inputDatasetId = input.inputDatasetId || input.defaultDatasetId;

    if (!inputDatasetId) {
        throw new Error('No inputDatasetId provided.');
    }

    // Log configuration details
    const configDetails = {
        datasetId: inputDatasetId,
        model,
        promptTemplate: prompt,
        multipleColumns
    };
    log.info('Configuration details:', configDetails);

    // Helper function to get nested field value using dot notation
    function getNestedValue(obj: any, path: string): any {
        return path.split('.').reduce((current, key) => current && current[key], obj);
    }
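    // Example: getNestedValue({ content: { text: 'hi' } }, 'content.text') returns 'hi';
    // any missing segment along the path yields undefined.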

    // Helper function to check if a value is empty
    function isEmpty(value: any): boolean {
        if (value === undefined || value === null) return true;
        if (typeof value === 'string') return value.trim() === '';
        if (Array.isArray(value)) return value.length === 0;
        if (typeof value === 'object') return Object.keys(value).length === 0;
        return false;
    }

    // Helper function to check if any placeholder field is empty
    function hasEmptyFields(promptStr: string, item: OutputItem): boolean {
        const fieldMatches = promptStr.match(/\{\{([^}]+)\}\}/g) || [];
        return fieldMatches.some(match => {
            const field = match.slice(2, -2).trim(); // Remove {{ and }}
            const value = getNestedValue(item, field);
            return isEmpty(value);
        });
    }

    // Helper function to replace field placeholders in prompt with actual values
    function replacePlaceholders(promptStr: string, item: OutputItem): string {
        return promptStr.replace(/\{\{([^}]+)\}\}/g, (_match, fieldName: string) => {
            const value = getNestedValue(item, fieldName.trim());
            return value !== undefined ? String(value) : '';
        });
    }
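    // Example: replacePlaceholders('Summarize this text: {{text}}', { text: 'Hello' })
    // returns 'Summarize this text: Hello'; missing fields are replaced with an empty string.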

    // Build the final prompt:
    // If multipleColumns is true, we instruct the LLM to return a strict JSON object.
    function buildFinalPrompt(promptText: string): string {
        if (!multipleColumns) {
            return promptText;
        }

        // Append clear instructions to return JSON only
        return `${promptText}

Important: Return only a strict JSON object with the requested fields as keys. No extra text or explanations, no markdown, just JSON.`;
    }

    // Fetch items from the input dataset
    let items: OutputItem[] = [];
    try {
        // First check if dataset exists
        const dataset = await Actor.apifyClient.dataset(inputDatasetId).get();
        if (!dataset) {
            throw new Error(`Dataset with ID ${inputDatasetId} does not exist`);
        }

        const inputDataset = await Actor.openDataset<OutputItem>(inputDatasetId);
        const { items: fetchedItems } = await inputDataset.getData();
        items = fetchedItems;

        // If test mode is enabled, limit the number of items
        if (testPrompt) {
            const itemCount = Math.min(testItemsCount, items.length);
            items = items.slice(0, itemCount);
            log.info(`Test mode enabled - processing ${itemCount} items out of ${fetchedItems.length}`);
        } else {
            log.info(`Fetched ${items.length} items from the input dataset.`);
        }
    } catch (datasetError: unknown) {
        if (datasetError instanceof Error) {
            log.error(`Error accessing dataset: ${datasetError.message}`);
        } else {
            log.error('Error accessing dataset: Unknown error occurred');
        }
        throw datasetError;
    }

    // Initialize API clients
    const providers = {
        openai: new OpenAIProvider(llmApiToken),
        anthropic: new AnthropicProvider(llmApiToken),
        google: new GoogleProvider(llmApiToken),
    };

    // Convert temperature from string to number
    const temperatureNum = parseFloat(temperature);

    // If multipleColumns is true, we can do a validation step with a sample item.
    async function validateJsonFormat(testItem: OutputItem): Promise<boolean> {
        if (!multipleColumns) return true; // No need to validate if single column.

        const provider = getProvider(model, explicitProvider);
        let finalPrompt = replacePlaceholders(buildFinalPrompt(prompt), testItem);

        for (let attempt = 1; attempt <= 3; attempt++) {
            try {
                const testResponse = await providers[provider].call(
                    finalPrompt,
                    model,
                    temperatureNum,
                    maxTokens
                );

                // First check if we got an empty response
                if (!testResponse) {
                    log.error('Empty response received from the API');
                    throw new Error('Empty response received from the API');
                }

                // Try parsing as JSON:
                try {
                    JSON.parse(testResponse);
                    return true; // JSON parsed successfully
                } catch (jsonError) {
                    if (attempt < 3) {
                        log.warning(`JSON validation attempt ${attempt} failed. Retrying...`);
                        log.debug('Response that failed JSON parsing:', { response: testResponse });
                        finalPrompt = `${finalPrompt}\n\nThe last response was not valid JSON. Please return valid JSON this time.`;
                    } else {
                        log.error('JSON validation attempts exhausted. The prompt may not produce valid JSON.');
                        log.debug('Final response that failed JSON parsing:', { response: testResponse });
                        return false;
                    }
                }
            } catch (apiError: any) {
                // Log the full error for debugging
                log.error('API call failed:', {
                    error: apiError.message,
                    type: apiError.type,
                    code: apiError.code,
                    param: apiError.param
                });

                // Rethrow API errors immediately instead of retrying
                throw apiError;
            }
        }
        return false;
    }

    if (items.length > 0) {
        const validationResult = await validateJsonFormat(items[0]);
        if (multipleColumns && !validationResult) {
            throw new Error('Failed to produce valid JSON after multiple attempts. Please adjust your prompt or disable multiple columns.');
        }
    }

    // Process each item
    for (let i = 0; i < items.length; i++) {
        const item = items[i];

        try {
            // Skip if any required fields are empty and skipItemIfEmpty is true
            if (skipItemIfEmpty && hasEmptyFields(prompt, item)) {
                log.info(`Skipping item ${i + 1} due to empty fields`);
                continue;
            }

            // Replace placeholders in the prompt
            const finalPrompt = replacePlaceholders(buildFinalPrompt(prompt), item);
            log.info(`Processing item ${i + 1}/${items.length}`, { prompt: finalPrompt });

            // Determine the provider and make the API call
            const provider = getProvider(model, explicitProvider); // returns 'openai' | 'anthropic' | 'google'
            const llmresponse = await providers[provider].call(
                finalPrompt,
                model,
                temperatureNum,
                maxTokens
            );

            log.info(`Item ${i + 1} response:`, { response: llmresponse });

            if (multipleColumns) {
                let parsedData: any;
                let attemptsLeft = 2; // After initial call, try parsing or retrying up to 2 times more if needed
                let currentResponse = llmresponse;
                let success = false;

                while (attemptsLeft >= 0) {
                    try {
                        parsedData = JSON.parse(currentResponse);
                        success = true;
                        break;
                    } catch (err) {
                        if (attemptsLeft > 0) {
                            // Retry by asking again for correct JSON
                            log.warning(`Failed to parse JSON for item ${i + 1}. Retrying...`);
                            const retryPrompt = `${finalPrompt}\n\nThe last response was not valid JSON. Please return valid JSON this time.`;
                            const retryResponse = await providers[provider].call(
                                retryPrompt,
                                model,
                                temperatureNum,
                                maxTokens
                            );
                            currentResponse = retryResponse;
                            attemptsLeft--;
                        } else {
                            // No attempts left
                            log.error(`Failed to parse JSON after multiple attempts for item ${i + 1}. Using raw response as single column.`);
                            break;
                        }
                    }
                }

                if (success && typeof parsedData === 'object' && parsedData !== null) {
                    // Push multiple columns
                    const outputItem: Record<string, unknown> = { ...item };
                    for (const key of Object.keys(parsedData)) {
                        outputItem[key] = parsedData[key];
                    }
                    await Actor.pushData(outputItem);
                } else {
                    // Fallback to single column mode if JSON parsing failed
                    const fallbackItem = { ...item, llmresponse: currentResponse };
                    await Actor.pushData(fallbackItem);
                }
            } else {
                // Single column mode: Just push the response
                item.llmresponse = llmresponse;
                await Actor.pushData(item);
            }

            // Respect rate limits
            await new Promise(resolve => setTimeout(resolve, REQUEST_INTERVAL_MS));
        } catch (error: unknown) {
            if (error instanceof Error) {
                log.error(`Error processing item ${i + 1}: ${error.message}`);
            } else {
                log.error(`Error processing item ${i + 1}: Unknown error occurred`);
            }

            // Decide whether to continue or break. For now, we rethrow to fail the actor on error.
            throw error;
        }
    }

    log.info('Actor finished successfully');
} catch (error: unknown) {
    if (error instanceof Error) {
        log.error(`Actor failed: ${error.message}`);
    } else {
        log.error('Actor failed: Unknown error occurred');
    }
    throw error;
} finally {
    await Actor.exit();
}

src/types.ts

export interface Input {
    inputDatasetId: string;
    defaultDatasetId: string;
    llmApiToken: string;
    prompt: string;
    model: string;
    temperature: string;
    maxTokens: number;
    skipItemIfEmpty?: boolean;
    multipleColumns?: boolean;
    provider?: 'openai' | 'anthropic' | 'google';
    testPrompt?: boolean;
    testItemsCount?: number;
}

// Note: the column name matches the `llmresponse` key used in main.ts and the input schema.
export interface OutputItem extends Record<string, any> {
    llmresponse: string | null;
}

export interface LLMProvider {
    call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string>;
}

src/providers/anthropic.ts

import Anthropic from '@anthropic-ai/sdk';
import { LLMProvider } from '../types.js';

export class AnthropicProvider implements LLMProvider {
    private client: Anthropic;

    constructor(apiKey: string) {
        this.client = new Anthropic({ apiKey });
    }

    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const message = await this.client.messages.create({
            model,
            max_tokens: maxTokens,
            temperature,
            messages: [{ role: 'user', content: promptText }],
        });

        if (!message.content || message.content.length === 0) {
            return '';
        }

        const textContent = message.content.find(c => c.type === 'text');
        return textContent?.text || '';
    }
}

src/providers/google.ts

import { GoogleGenerativeAI } from '@google/generative-ai';
import { LLMProvider } from '../types.js';

export class GoogleProvider implements LLMProvider {
    private client: GoogleGenerativeAI;

    constructor(apiKey: string) {
        this.client = new GoogleGenerativeAI(apiKey);
    }

    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const genModel = this.client.getGenerativeModel({ model });

        const result = await genModel.generateContent({
            contents: [{ role: 'user', parts: [{ text: promptText }] }],
            generationConfig: {
                temperature,
                maxOutputTokens: maxTokens,
            },
        });

        const response = result.response;
        return response.text();
    }
}

src/providers/index.ts

export * from './openai.js';
export * from './anthropic.js';
export * from './google.js';

export const getProvider = (model: string, explicitProvider?: string): 'openai' | 'anthropic' | 'google' => {
    if (explicitProvider) {
        return explicitProvider as 'openai' | 'anthropic' | 'google';
    }
    if (model.startsWith('claude')) return 'anthropic';
    if (model.startsWith('gemini')) return 'google';
    return 'openai';
};
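// Example: getProvider('gpt-4o-mini') -> 'openai';
// getProvider('claude-3-5-haiku-latest') -> 'anthropic'; getProvider('gemini-1.5-flash') -> 'google'.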

src/providers/openai.ts

import OpenAI from 'openai';
import { LLMProvider } from '../types.js';

export class OpenAIProvider implements LLMProvider {
    private client: OpenAI;

    constructor(apiKey: string) {
        this.client = new OpenAI({ apiKey });
    }

    async call(promptText: string, model: string, temperature: number, maxTokens: number): Promise<string> {
        const completion = await this.client.chat.completions.create({
            messages: [{ role: 'user', content: promptText }],
            model,
            temperature,
            max_tokens: maxTokens,
        });
        return completion.choices[0]?.message?.content || '';
    }
}