Parsera avatar

Parsera

Try for free

No credit card required

Go to Store
Parsera

Parsera

parsera-labs/parsera
Try for free

No credit card required

Extract Data from ANY website with Parsera.org

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:20 AS builder
5
6# Check preinstalled packages
7RUN npm ls crawlee apify puppeteer playwright
8
9# Copy just package.json and package-lock.json
10# to speed up the build using Docker layer cache.
11COPY package*.json ./
12
13# Install all dependencies. Don't audit to speed up the installation.
14RUN npm install --include=dev --audit=false
15
16# Next, copy the source files using the user set
17# in the base image.
18COPY . ./
19
20# Build the project. Dependencies were already installed by the
21# `npm install` step above, so this step only runs the build script.
22RUN npm run build
23
24# Create final image
25FROM apify/actor-node:20
26
27# Check preinstalled packages
28RUN npm ls crawlee apify puppeteer playwright
29
30# Copy just package.json and package-lock.json
31# to speed up the build using Docker layer cache.
32COPY package*.json ./
33
34# Install NPM packages, skip optional and development dependencies to
35# keep the image small. Avoid logging too much and print the dependency
36# tree for debugging
37RUN npm --quiet set progress=false \
38    && npm install --omit=dev --omit=optional \
39    && echo "Installed NPM packages:" \
40    && (npm list --omit=dev --all || true) \
41    && echo "Node.js version:" \
42    && node --version \
43    && echo "NPM version:" \
44    && npm --version \
45    && rm -r ~/.npm
46
47# Copy built JS files from builder image
48COPY --from=builder /usr/src/app/dist ./dist
49
50# Next, copy the remaining files and directories with the source code.
51# Since we do this after NPM install, quick build will be really fast
52# for most source file changes.
53COPY . ./
54
55
56# Run the image.
57CMD npm run start:prod --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "parsera",
4    "title": "Parsera Data Extractor",
5    "description": "Extract structured data from any webpage using AI-powered Parsera API",
6    "version": "1.0",
7    "buildTag": "latest",
8    "minMemoryMbytes": 256,
9    "maxMemoryMbytes": 4096,
10    "dockerfile": "./Dockerfile",
11    "input": "./input_schema.json"
12}

.actor/input_schema.json

1{
2    "description": "Configure the data extraction parameters for Parsera API",
3    "properties": {
4        "apiKey": {
5            "description": "Your Parsera API key for authentication",
6            "editor": "textfield",
7            "isSecret": true,
8            "title": "Parsera API Key",
9            "type": "string"
10        },
11        "attributes": {
12            "description": "Define what data to extract from the webpage",
13            "editor": "json",
14            "prefill": [
15                {
16                    "description": "The main title of the page",
17                    "name": "title"
18                }
19            ],
20            "sectionCaption": "Extraction Settings",
21            "sectionDescription": "Configure what data to extract from the webpage",
22            "title": "Extraction Attributes",
23            "type": "array"
24        },
25        "cookies": {
26            "description": "Custom cookies to use for the request (optional)",
27            "editor": "json",
28            "nullable": true,
29            "title": "Custom Cookies",
30            "type": "array"
31        },
32        "precisionMode": {
33            "default": false,
34            "description": "Enable for more accurate but slower extraction",
35            "nullable": true,
36            "title": "Precision Mode",
37            "type": "boolean"
38        },
39        "proxyCountry": {
40            "description": "Country for the proxy IP (optional)",
41            "editor": "select",
42            "enum": [
43                "random",
44                "Afghanistan",
45                "Albania",
46                "Algeria",
47                "Argentina",
48                "Armenia",
49                "Aruba",
50                "Australia",
51                "Austria",
52                "Azerbaijan",
53                "Bahamas",
54                "Bahrain",
55                "Bangladesh",
56                "Belarus",
57                "Belgium",
58                "BosniaandHerzegovina",
59                "Brazil",
60                "BritishVirginIslands",
61                "Brunei",
62                "Bulgaria",
63                "Cambodia",
64                "Cameroon",
65                "Canada",
66                "Chile",
67                "China",
68                "Colombia",
69                "CostaRica",
70                "Croatia",
71                "Cuba",
72                "Cyprus",
73                "Chechia",
74                "Denmark",
75                "DominicanRepublic",
76                "Ecuador",
77                "Egypt",
78                "ElSalvador",
79                "Estonia",
80                "Ethiopia",
81                "Finland",
82                "France",
83                "Georgia",
84                "Germany",
85                "Ghana",
86                "Greece",
87                "Guatemala",
88                "Guyana",
89                "HashemiteKingdomofJordan",
90                "HongKong",
91                "Hungary",
92                "India",
93                "Indonesia",
94                "Iran",
95                "Iraq",
96                "Ireland",
97                "Israel",
98                "Italy",
99                "Jamaica",
100                "Japan",
101                "Kazakhstan",
102                "Kenya",
103                "Kosovo",
104                "Kuwait",
105                "Latvia",
106                "Liechtenstein",
107                "Luxembourg",
108                "Macedonia",
109                "Madagascar",
110                "Malaysia",
111                "Mauritius",
112                "Mexico",
113                "Mongolia",
114                "Montenegro",
115                "Morocco",
116                "Mozambique",
117                "Myanmar",
118                "Nepal",
119                "Netherlands",
120                "NewZealand",
121                "Nigeria",
122                "Norway",
123                "Oman",
124                "Pakistan",
125                "Palestine",
126                "Panama",
127                "PapuaNewGuinea",
128                "Paraguay",
129                "Peru",
130                "Philippines",
131                "Poland",
132                "Portugal",
133                "PuertoRico",
134                "Qatar",
135                "RepublicOfLithuania",
136                "RepublicOfMoldova",
137                "Romania",
138                "Russia",
139                "SaudiArabia",
140                "Senegal",
141                "Serbia",
142                "Seychelles",
143                "Singapore",
144                "Slovakia",
145                "Slovenia",
146                "Somalia",
147                "SouthAfrica",
148                "SouthKorea",
149                "Spain",
150                "SriLanka",
151                "Sudan",
152                "Suriname",
153                "Sweden",
154                "Switzerland",
155                "Syria",
156                "Taiwan",
157                "Tajikistan",
158                "Thailand",
159                "TrinidadandTobago",
160                "Tunisia",
161                "Turkey",
162                "Uganda",
163                "Ukraine",
164                "UnitedArabEmirates",
165                "UnitedKingdom",
166                "UnitedStates",
167                "Uzbekistan",
168                "Venezuela",
169                "Vietnam",
170                "Zambia"
171            ],
172            "enumTitles": [
173                "Random Country",
174                "Afghanistan",
175                "Albania",
176                "Algeria",
177                "Argentina",
178                "Armenia",
179                "Aruba",
180                "Australia",
181                "Austria",
182                "Azerbaijan",
183                "Bahamas",
184                "Bahrain",
185                "Bangladesh",
186                "Belarus",
187                "Belgium",
188                "Bosnia and Herzegovina",
189                "Brazil",
190                "British Virgin Islands",
191                "Brunei",
192                "Bulgaria",
193                "Cambodia",
194                "Cameroon",
195                "Canada",
196                "Chile",
197                "China",
198                "Colombia",
199                "Costa Rica",
200                "Croatia",
201                "Cuba",
202                "Cyprus",
203                "Czechia",
204                "Denmark",
205                "Dominican Republic",
206                "Ecuador",
207                "Egypt",
208                "El Salvador",
209                "Estonia",
210                "Ethiopia",
211                "Finland",
212                "France",
213                "Georgia",
214                "Germany",
215                "Ghana",
216                "Greece",
217                "Guatemala",
218                "Guyana",
219                "Hashemite Kingdom of Jordan",
220                "Hong Kong",
221                "Hungary",
222                "India",
223                "Indonesia",
224                "Iran",
225                "Iraq",
226                "Ireland",
227                "Israel",
228                "Italy",
229                "Jamaica",
230                "Japan",
231                "Kazakhstan",
232                "Kenya",
233                "Kosovo",
234                "Kuwait",
235                "Latvia",
236                "Liechtenstein",
237                "Luxembourg",
238                "Macedonia",
239                "Madagascar",
240                "Malaysia",
241                "Mauritius",
242                "Mexico",
243                "Mongolia",
244                "Montenegro",
245                "Morocco",
246                "Mozambique",
247                "Myanmar",
248                "Nepal",
249                "Netherlands",
250                "New Zealand",
251                "Nigeria",
252                "Norway",
253                "Oman",
254                "Pakistan",
255                "Palestine",
256                "Panama",
257                "Papua New Guinea",
258                "Paraguay",
259                "Peru",
260                "Philippines",
261                "Poland",
262                "Portugal",
263                "Puerto Rico",
264                "Qatar",
265                "Republic of Lithuania",
266                "Republic of Moldova",
267                "Romania",
268                "Russia",
269                "Saudi Arabia",
270                "Senegal",
271                "Serbia",
272                "Seychelles",
273                "Singapore",
274                "Slovakia",
275                "Slovenia",
276                "Somalia",
277                "South Africa",
278                "South Korea",
279                "Spain",
280                "Sri Lanka",
281                "Sudan",
282                "Suriname",
283                "Sweden",
284                "Switzerland",
285                "Syria",
286                "Taiwan",
287                "Tajikistan",
288                "Thailand",
289                "Trinidad and Tobago",
290                "Tunisia",
291                "Turkey",
292                "Uganda",
293                "Ukraine",
294                "United Arab Emirates",
295                "United Kingdom",
296                "United States",
297                "Uzbekistan",
298                "Venezuela",
299                "Vietnam",
300                "Zambia"
301            ],
302            "nullable": true,
303            "sectionCaption": "Advanced Options",
304            "sectionDescription": "Optional settings for customizing the extraction process",
305            "title": "Proxy Location",
306            "type": "string"
307        },
308        "url": {
309            "description": "The webpage URL to extract data from",
310            "editor": "textfield",
311            "pattern": "^https?://[^\\s/$.?#].[^\\s]*$",
312            "sectionCaption": "Basic Configuration",
313            "sectionDescription": "Essential settings for data extraction",
314            "title": "Target URL",
315            "type": "string"
316        }
317    },
318    "required": ["url", "apiKey", "attributes"],
319    "schemaVersion": 1,
320    "title": "Parsera Data Extractor Input",
321    "type": "object"
322}

src/config/constants.ts

/** Root URL of the Parsera REST API (v1); endpoint paths are appended to it. */
export const PARSERA_API_BASE_URL = "https://api.parsera.org/v1";

/** Default proxy country, written as a run-together country name. */
export const DEFAULT_PROXY_COUNTRY = "UnitedStates";

src/schemas/input.ts

1import { z } from "zod";
2import { PARSERA_API_BASE_URL } from "../config/constants.js";
3import { log } from "apify";
4
/** Builds a string schema that rejects the empty string with `message`. */
const nonEmptyString = (message: string) => z.string().min(1, message);

/**
 * One extraction attribute from the actor input: a `name` and a
 * `description`, both required to be non-empty strings.
 */
export const AttributeSchema = z.object({
    name: nonEmptyString("Attribute name must not be empty"),
    description: nonEmptyString("Attribute description must not be empty"),
});
9
/** Allowed values of a cookie's `sameSite` field. */
const sameSitePolicy = z.enum(["None", "Lax", "Strict"]);

/**
 * One custom cookie: `sameSite` is mandatory and restricted to the values
 * above; every other key is accepted as long as its value is a string.
 */
export const CookieSchema = z
    .object({ sameSite: sameSitePolicy })
    .catchall(z.string());
15
/**
 * Fetches the list of valid proxy countries from the Parsera API and turns
 * it into a validation schema.
 *
 * The request is bounded by a 10 second timeout so that an unresponsive API
 * cannot stall input validation. Any failure (timeout, non-2xx status,
 * unexpected response shape) is logged as a warning and is not rethrown.
 *
 * @returns A Zod enum of "random" plus the keys of the API response, or a
 *          plain `z.string()` schema when the list could not be obtained.
 */
export const getProxyCountriesSchema = async () => {
    try {
        const response = await fetch(
            `${PARSERA_API_BASE_URL}/proxy-countries`,
            { signal: AbortSignal.timeout(10000) }
        );
        if (!response.ok) {
            throw new Error(
                `Failed to fetch proxy countries: ${response.statusText}`
            );
        }

        // Only the keys are used; reject anything that is not a plain
        // object so array indices are never mistaken for country names.
        const data: unknown = await response.json();
        if (data === null || typeof data !== "object" || Array.isArray(data)) {
            throw new Error(
                "Failed to fetch proxy countries: unexpected response shape"
            );
        }

        const proxyCountries = Object.keys(data);
        return z.enum(["random", ...proxyCountries] as [string, ...string[]]);
    } catch (error) {
        log.warning(
            "Failed to fetch proxy countries. Using default validation.",
            {
                // An Error serialises to `{}`, so log its message instead.
                error: error instanceof Error ? error.message : String(error),
            }
        );
        // Fallback to basic string validation if API call fails
        return z.string();
    }
};
40
/**
 * Builds the Zod schema used to validate the actor input.
 *
 * Asynchronous because the set of accepted proxy countries is obtained from
 * `getProxyCountriesSchema()`.
 *
 * The three optional fields are declared `nullable` in the actor's input
 * schema, so an explicit `null` is accepted here as well and normalised to
 * `undefined`. The parsed output type is therefore the same as with a plain
 * `.optional()`.
 *
 * @returns A Zod object schema for the actor input.
 */
export const createInputSchema = async () => {
    // Widen the enum-or-string union to a common type so that `.nullish()`
    // and `.transform()` can be called on it.
    const proxyCountriesSchema: z.ZodType<string> =
        await getProxyCountriesSchema();

    return z.object({
        url: z.string().url("Invalid URL format"),
        apiKey: z.string().min(1, "API key must not be empty"),
        attributes: z
            .array(AttributeSchema)
            .min(1, "At least one attribute is required"),
        proxyCountry: proxyCountriesSchema
            .nullish()
            .transform((value) => value ?? undefined),
        cookies: z
            .array(CookieSchema)
            .nullish()
            .transform((value) => value ?? undefined),
        precisionMode: z
            .boolean()
            .nullish()
            .transform((value) => value ?? undefined),
    });
};

src/services/parsera.ts

1import {
2    ParseraError,
3    ParseraRequestBody,
4    ParseraResponse,
5} from "../types/parsera.js";
6
/** Tuning knobs for how the client retries a failed request. */
export interface ParseraRetryOptions {
    /**
     * Delay in milliseconds before the first retry; later retries
     * scale it by `backoffFactor`.
     * @default 1000
     */
    initialDelay?: number;

    /**
     * Growth factor of the delay between consecutive retries.
     * The wait before retry number `n` (starting at 0) is
     * `initialDelay * backoffFactor ** n`, e.g. 1000ms, 2000ms, 4000ms.
     * @default 2
     */
    backoffFactor?: number;

    /**
     * Upper bound on the number of retries performed for one request.
     * @default 3
     */
    maxRetries?: number;
}
30
/** A single field the caller wants extracted from the page. */
export interface ParseraAttribute {
    /**
     * Attribute name; sent to the API as the `name` of this attribute.
     */
    name: string;

    /**
     * Plain-language description of the value to extract.
     * The more specific the description, the better.
     */
    description: string;
}
44
/** A cookie forwarded with the extraction request. */
export interface ParseraCookie {
    /**
     * SameSite policy of the cookie; one of the three literal values below.
     */
    sameSite: "None" | "Lax" | "Strict";

    /**
     * Any further cookie property, as a string-valued key
     * (for example `name` and `value`).
     */
    [key: string]: string;
}
57
/** Options accepted by the `Parsera` constructor. */
export interface ParseraOptions {
    /**
     * Parsera API key, sent with every request in the `X-API-KEY` header.
     * The constructor rejects keys shorter than 32 characters.
     */
    apiKey: string;

    /**
     * Root URL of the Parsera API; override only for a custom endpoint.
     * @default "https://api.parsera.org/v1"
     */
    baseUrl?: string;

    /**
     * Proxy country used by `extract()` when the call does not specify one.
     * Expected to be a country name in the API's format, e.g.
     * "random", "UnitedStates", "UnitedKingdom", "Germany", "Japan".
     * See the constructor for the fallback applied when this is omitted.
     * @see https://api.parsera.org/v1/proxy-countries for full list of supported countries
     */
    defaultProxyCountry?: string;

    /**
     * Per-request time limit in milliseconds; a request that exceeds it
     * is aborted.
     * @default 30000 (30 seconds)
     */
    timeout?: number;

    /**
     * Retry behaviour for failed requests.
     * @see ParseraRetryOptions
     */
    retryOptions?: ParseraRetryOptions;
}
94
/** Arguments of `Parsera.extract()`. */
export interface ExtractOptions {
    /**
     * URL of the webpage to extract data from.
     * Must be parseable by the `URL` constructor.
     */
    url: string;

    /**
     * Attributes to extract from the webpage.
     * Either an array of ParseraAttribute objects or a Record of
     * name-description pairs (converted to the array form before sending).
     */
    attributes: ParseraAttribute[] | Record<string, string>;

    /**
     * Proxy country for this request.
     * Overrides the client's defaultProxyCountry setting.
     * Uses the API's country names, not ISO codes.
     * @example "random" | "UnitedStates" | "UnitedKingdom" | "Germany" | "Japan"
     * @see https://api.parsera.org/v1/proxy-countries for full list of supported countries
     */
    proxyCountry?: string;

    /**
     * Cookies to be sent with the request.
     * Useful for accessing pages that require authentication.
     */
    cookies?: ParseraCookie[];

    /**
     * Enable precision mode for more accurate extractions.
     * May increase processing time.
     * @default false
     */
    precisionMode?: boolean;

    /**
     * AbortSignal for request cancellation.
     * Aborting it cancels the in-flight HTTP request.
     */
    signal?: AbortSignal;
}
136
/**
 * Names of the events a handler can be registered for.
 * The literals are the built-in lifecycle events; the trailing `string`
 * additionally admits arbitrary custom event names.
 */
export type ParseraEventType =
    // extraction lifecycle
    | "extract:start"
    | "extract:complete"
    | "extract:error"
    // HTTP request lifecycle
    | "request:start"
    | "request:end"
    | "request:retry"
    | "request:error"
    // specific failure conditions
    | "rateLimit"
    | "timeout"
    // Allow custom event types
    | string;
148
/**
 * Payload handed to every event handler.
 * `T` is the type of the optional `data` field.
 */
export interface ParseraEvent<T = unknown> {
    /** Name of the event being delivered. */
    type: ParseraEventType;
    /** Creation time of the event, from `Date.now()` (epoch milliseconds). */
    timestamp: number;
    /** Event-specific payload, if any. */
    data?: T;
    /** Error associated with the event, if any. */
    error?: Error;
    /** Retry attempt the event refers to, if any. */
    retryCount?: number;
}
156
/**
 * Callback invoked with a ParseraEvent; may be synchronous or return a
 * promise.
 */
export type ParseraEventHandler<T = unknown> = (
    event: ParseraEvent<T>
) => void | Promise<void>;
160
/** Delivery options stored per event type when a handler is registered. */
export interface ParseraEventOptions {
    /**
     * Swallow errors thrown by handlers instead of propagating them
     * into the operation that emitted the event.
     * @default true
     */
    catchErrors?: boolean;

    /**
     * Fire handlers without awaiting them, so they do not block the
     * operation that emitted the event.
     * @default false
     */
    async?: boolean;
}
176
177export class Parsera {
178    private readonly apiKey: string;
179    private readonly baseUrl: string;
180    private readonly defaultProxyCountry: string;
181    private readonly timeout: number;
182    private readonly retryOptions: Required<ParseraRetryOptions>;
183    private lastRequestTime: number = 0;
184    private readonly minRequestInterval: number = 100; // 100ms between requests
185    private readonly eventHandlers = new Map<
186        ParseraEventType,
187        Set<ParseraEventHandler<any>>
188    >();
189    private readonly eventOptions = new Map<
190        ParseraEventType,
191        ParseraEventOptions
192    >();
193
194    /**
195     * Creates a new Parsera client instance.
196     *
197     * @example
198     * ```typescript
199     * const parsera = new Parsera({
200     *     apiKey: "your-api-key",
201     *     timeout: 60000, // 60 second timeout
202     *     retryOptions: {
203     *         maxRetries: 3,
204     *         backoffFactor: 2,
205     *         initialDelay: 1000,
206     *     }
207     * });
208     * ```
209     */
210    constructor({
211        apiKey,
212        baseUrl = "https://api.parsera.org/v1",
213        defaultProxyCountry = "US",
214        timeout = 30000,
215        retryOptions = {},
216    }: ParseraOptions) {
217        this.validateApiKey(apiKey);
218        this.apiKey = apiKey;
219        this.baseUrl = baseUrl;
220        this.defaultProxyCountry = defaultProxyCountry;
221        this.timeout = timeout;
222        this.retryOptions = {
223            maxRetries: retryOptions.maxRetries ?? 3,
224            backoffFactor: retryOptions.backoffFactor ?? 2,
225            initialDelay: retryOptions.initialDelay ?? 1000,
226        };
227    }
228
229    private validateApiKey(apiKey: string): void {
230        if (!apiKey || typeof apiKey !== "string" || apiKey.length < 32) {
231            throw new Error("Invalid API key format");
232        }
233    }
234
235    private validateUrl(url: string): void {
236        try {
237            new URL(url);
238        } catch {
239            throw new Error("Invalid URL format");
240        }
241    }
242
243    private async enforceRateLimit(): Promise<void> {
244        const now = Date.now();
245        const timeSinceLastRequest = now - this.lastRequestTime;
246
247        if (timeSinceLastRequest < this.minRequestInterval) {
248            await new Promise((resolve) =>
249                setTimeout(
250                    resolve,
251                    this.minRequestInterval - timeSinceLastRequest
252                )
253            );
254        }
255        this.lastRequestTime = Date.now();
256    }
257
258    private async fetchWithTimeout(
259        url: string,
260        options: RequestInit & { timeout?: number }
261    ): Promise<Response> {
262        const { timeout = this.timeout, ...fetchOptions } = options;
263
264        const controller = new AbortController();
265        if (options.signal) {
266            options.signal.addEventListener("abort", () => controller.abort());
267        }
268
269        const timeoutId = setTimeout(() => controller.abort(), timeout);
270
271        try {
272            const response = await fetch(url, {
273                ...fetchOptions,
274                signal: controller.signal,
275            });
276            return response;
277        } finally {
278            clearTimeout(timeoutId);
279        }
280    }
281
282    private async retryableRequest(
283        requestFn: () => Promise<Response>,
284        retryCount = 0
285    ): Promise<Response> {
286        try {
287            await this.enforceRateLimit();
288            const response = await requestFn();
289
290            if (
291                response.status === 429 &&
292                retryCount < this.retryOptions.maxRetries
293            ) {
294                await this.emit("rateLimit", { retryCount });
295                await this.emit("request:retry", { retryCount });
296
297                const delay =
298                    this.retryOptions.initialDelay *
299                    Math.pow(this.retryOptions.backoffFactor, retryCount);
300                await new Promise((resolve) => setTimeout(resolve, delay));
301                return this.retryableRequest(requestFn, retryCount + 1);
302            }
303
304            return response;
305        } catch (error) {
306            if (error instanceof Error) {
307                if (error.name === "AbortError") {
308                    await this.emit("timeout", undefined, error);
309                }
310                await this.emit("request:error", undefined, error);
311
312                if (
313                    retryCount < this.retryOptions.maxRetries &&
314                    this.isRetryableError(error)
315                ) {
316                    await this.emit("request:retry", { retryCount });
317                    const delay =
318                        this.retryOptions.initialDelay *
319                        Math.pow(this.retryOptions.backoffFactor, retryCount);
320                    await new Promise((resolve) => setTimeout(resolve, delay));
321                    return this.retryableRequest(requestFn, retryCount + 1);
322                }
323            }
324            throw error;
325        }
326    }
327
328    private isRetryableError(error: unknown): boolean {
329        return (
330            error instanceof Error &&
331            (error.message.includes("network") ||
332                error.message.includes("timeout") ||
333                error.message.includes("ECONNRESET"))
334        );
335    }
336
337    /**
338     * Converts a Record<string, string> to ParseraAttribute[]
339     */
340    private convertToAttributes(
341        attrs: Record<string, string>
342    ): ParseraAttribute[] {
343        return Object.entries(attrs).map(([name, description]) => ({
344            name,
345            description,
346        }));
347    }
348
349    /**
350     * Registers an event handler for a specific event type
351     *
352     * @param eventType - Type of event to listen for
353     * @param handler - Function to handle the event
354     * @param options - Configuration options for event handling
355     *
356     * @example
357     * ```typescript
358     * parsera.on('extract:complete', (event) => {
359     *     console.log(`Extraction completed with ${event.data.length} items`);
360     * });
361     *
362     * parsera.on('request:retry', (event) => {
363     *     console.log(`Retrying request (attempt ${event.retryCount})`);
364     * });
365     *
366     * // Custom event
367     * parsera.on('my:custom:event', (event) => {
368     *     console.log('Custom event data:', event.data);
369     * });
370     * ```
371     */
372    on<T = unknown>(
373        eventType: ParseraEventType,
374        handler: ParseraEventHandler<T>,
375        options: ParseraEventOptions = {}
376    ): void {
377        if (!this.eventHandlers.has(eventType)) {
378            this.eventHandlers.set(eventType, new Set());
379        }
380        this.eventHandlers
381            .get(eventType)!
382            .add(handler as ParseraEventHandler<any>);
383        this.eventOptions.set(eventType, {
384            async: options.async ?? false,
385            catchErrors: options.catchErrors ?? true,
386        });
387    }
388
389    /**
390     * Removes an event handler for a specific event type
391     */
392    off<T = unknown>(
393        eventType: ParseraEventType,
394        handler: ParseraEventHandler<T>
395    ): void {
396        const handlers = this.eventHandlers.get(eventType);
397        if (handlers) {
398            handlers.delete(handler as ParseraEventHandler<any>);
399        }
400    }
401
402    /**
403     * Removes all event handlers for a specific event type
404     */
405    removeAllListeners(eventType?: ParseraEventType): void {
406        if (eventType) {
407            this.eventHandlers.delete(eventType);
408        } else {
409            this.eventHandlers.clear();
410        }
411    }
412
413    private async emit<T = unknown>(
414        eventType: ParseraEventType,
415        data?: T,
416        error?: Error,
417        retryCount?: number
418    ): Promise<void> {
419        const handlers = this.eventHandlers.get(eventType);
420        if (!handlers?.size) return;
421
422        const event: ParseraEvent<T> = {
423            type: eventType,
424            timestamp: Date.now(),
425            data,
426            error,
427            retryCount,
428        };
429
430        const options = this.eventOptions.get(eventType) ?? {
431            async: false,
432            catchErrors: true,
433        };
434
435        const handleEvent = async (handler: ParseraEventHandler<any>) => {
436            try {
437                await handler(event);
438            } catch (error) {
439                if (!options.catchErrors) {
440                    throw error;
441                }
442                await this.emit("handler:error", undefined, error as Error);
443            }
444        };
445
446        if (options.async) {
447            handlers.forEach((handler) => {
448                handleEvent(handler).catch(() => {});
449            });
450        } else {
451            await Promise.all(
452                Array.from(handlers).map((handler) => handleEvent(handler))
453            );
454        }
455    }
456
457    /**
458     * Extracts data from a webpage using the Parsera API.
459     *
460     * @param options - Configuration options for the extraction
461     * @returns Promise resolving to an array of extracted data objects
462     *
463     * @throws {Error} When API key is invalid
464     * @throws {Error} When URL is invalid
465     * @throws {Error} When request times out
466     * @throws {Error} When rate limit is exceeded (after retries)
467     * @throws {Error} When no data is found
468     *
469     * @example
470     * ```typescript
471     * // Basic usage with attribute record
472     * const results = await parsera.extract({
473     *     url: "https://example.com/products",
474     *     attributes: {
475     *         title: "Extract the product title",
476     *         price: "Get the product price",
477     *     }
478     * });
479     *
480     * // Advanced usage with all options
481     * const results = await parsera.extract({
482     *     url: "https://example.com/products",
483     *     attributes: [
484     *         { name: "title", description: "Extract the product title" },
485     *         { name: "price", description: "Get the product price" }
486     *     ],
487     *     proxyCountry: "GB",
488     *     cookies: [
489     *         { name: "session", value: "abc123", sameSite: "Lax" }
490     *     ],
491     *     precisionMode: true,
492     *     signal: abortController.signal
493     * });
494     *
495     * // With request cancellation
496     * const controller = new AbortController();
497     * const promise = parsera.extract({
498     *     url: "https://example.com",
499     *     attributes: { title: "Extract the title" },
500     *     signal: controller.signal
501     * });
502     *
503     * // Cancel the request after 5 seconds
504     * setTimeout(() => controller.abort(), 5000);
505     * ```
506     *
507     * @example
508     * // Example return value:
509     * [
510     *     {
511     *         "title": "Product Name",
512     *         "price": "$99.99"
513     *     },
514     *     {
515     *         "title": "Another Product",
516     *         "price": "$149.99"
517     *     }
518     * ]
519     */
async extract({
    url,
    attributes,
    proxyCountry,
    cookies,
    precisionMode,
    signal,
}: ExtractOptions): Promise<Record<string, string>[]> {
    await this.emit("extract:start", {
        url,
        attributes,
        proxyCountry,
        cookies,
        precisionMode,
        signal,
    });

    // Runs before the try block, so a validation error propagates unwrapped
    // and does not trigger an "extract:error" event.
    this.validateUrl(url);

    try {
        const requestBody: ParseraRequestBody = {
            url,
            // The API takes an attribute array; a name -> description record
            // is converted first.
            attributes: Array.isArray(attributes)
                ? attributes
                : this.convertToAttributes(attributes),
            proxy_country: proxyCountry || this.defaultProxyCountry,
        };

        if (cookies) {
            requestBody.cookies = cookies;
        }

        // "mode" is only sent when precision mode is requested; otherwise the
        // field is omitted from the request body.
        if (precisionMode) {
            requestBody.mode = "precision";
        }

        const response = await this.retryableRequest(() =>
            this.fetchWithTimeout(`${this.baseUrl}/extract`, {
                method: "POST",
                headers: {
                    "Content-Type": "application/json",
                    "X-API-KEY": this.apiKey,
                },
                body: JSON.stringify(requestBody),
                signal,
            })
        );

        if (!response.ok) {
            // handleError always throws (its return type is never).
            await this.handleError(response);
        }

        const data = (await response.json()) as ParseraResponse;
        if (!data.data?.length) {
            throw new Error(
                data.message ||
                    "No data returned from Parsera API. Make sure the website contains the data and the attribute descriptions are clear."
            );
        }

        await this.emit("extract:complete", data);
        return data.data;
    } catch (error) {
        // Normalise non-Error throwables so "extract:error" is emitted for
        // every failure and listeners always receive an Error instance.
        const cause =
            error instanceof Error ? error : new Error("Unknown error");
        await this.emit("extract:error", undefined, cause);

        // Keep the original error name (e.g. "AbortError") on the wrapper so
        // callers can still tell what kind of failure occurred.
        const wrapped = new Error(`Failed to extract data: ${cause.message}`);
        wrapped.name = cause.name;
        throw wrapped;
    }
}
590
/**
 * Delegates to {@link extract}; exists so the method name matches the
 * Python library interface.
 *
 * @param options - Passed through to {@link extract} unchanged
 * @returns Whatever {@link extract} resolves with
 *
 * @example
 * ```typescript
 * const results = await parsera.run({
 *     url: "https://example.com",
 *     attributes: { title: "Extract the title" }
 * });
 * ```
 */
async run(options: ExtractOptions): Promise<Record<string, string>[]> {
    const records = await this.extract(options);
    return records;
}
607
/**
 * Delegates to {@link extract}; exists so the method name matches the
 * Python library interface.
 *
 * @param options - Passed through to {@link extract} unchanged
 * @returns Whatever {@link extract} resolves with
 *
 * @example
 * ```typescript
 * const results = await parsera.arun({
 *     url: "https://example.com",
 *     attributes: { title: "Extract the title" }
 * });
 * ```
 */
async arun(options: ExtractOptions): Promise<Record<string, string>[]> {
    const records = await this.extract(options);
    return records;
}
624
/**
 * Converts a non-OK HTTP response into a descriptive Error and throws it.
 *
 * @param response - The failed response; its body is read as JSON when possible
 * @throws {Error} Always. The message depends on the HTTP status and, for 400
 * and unlisted statuses, on the `message` field of the error body.
 */
private async handleError(response: Response): Promise<never> {
    const status = response.status;

    // The error body is not guaranteed to be JSON. A parse failure must not
    // replace the status-specific error below, so it is treated as
    // "no details available".
    let errorData: Partial<ParseraError> | undefined;
    try {
        errorData = (await response.json()) as ParseraError;
    } catch {
        errorData = undefined;
    }

    switch (status) {
        case 401:
            throw new Error(
                "Invalid Parsera API key. Please check your credentials."
            );
        case 429:
            throw new Error("Rate limit exceeded. Please try again later.");
        case 400:
            throw new Error(
                `Bad request: ${errorData?.message || "Unknown error"}`
            );
        default:
            // statusText may be empty, so fall back to the numeric status to
            // keep the message informative.
            throw new Error(
                `Parsera API error: ${
                    errorData?.message ||
                    response.statusText ||
                    `HTTP ${status}`
                }`
            );
    }
}
648}

src/types/parsera.ts

/** One field to extract: its name and a description of what to extract. */
export interface ParseraAttribute {
    name: string;
    description: string;
}

/** A cookie record: arbitrary string fields plus a mandatory `sameSite` policy. */
export interface ParseraCookie {
    [key: string]: string;
    sameSite: "None" | "Lax" | "Strict";
}

/** Shape of the validated Actor input. */
export interface BaseInput {
    /** Target URL to extract data from */
    url: string;
    /** Parsera API key */
    apiKey: string;
    /** Array of attributes to extract */
    attributes: ParseraAttribute[];
    /** Country for proxy IP */
    proxyCountry?: string;
    /** Cookies to inject into the request */
    cookies?: ParseraCookie[];
    /** Whether precision mode was requested */
    precisionMode?: boolean;
}

export interface ParseraResponse {
    /** Array of extracted data objects */
    data: Record<string, string>[];
    /** Message from the API */
    message?: string;
}

export interface ParseraError {
    /** Error message from the API */
    message: string;
    /** Error code for identifying the type of error */
    code: string;
}

export interface ParseraRequestBody {
    /** Target URL to extract data from */
    url: string;
    /** Array of attributes to extract */
    attributes: ParseraAttribute[];
    /** Country for proxy IP */
    proxy_country?: string;
    /** Cookies to inject into the request */
    cookies?: ParseraCookie[];
    /** Extraction mode: "standard" or "precision" */
    mode?: "standard" | "precision";
}

src/utils/validation.test.ts

1import { describe, it, expect, vi, beforeEach } from "vitest";
2import { validateInput } from "./validation.js";
3import { createInputSchema } from "../schemas/input.js";
4import { z } from "zod";
5
// Replace the real schema factory so each test controls the schema it returns.
vi.mock("../schemas/input.js", () => {
    return { createInputSchema: vi.fn() };
});

/**
 * Builds the Zod schema the tests hand to the mocked `createInputSchema`.
 */
const createMockSchema = () => {
    // String that must contain at least one character, with a custom message.
    const nonEmpty = (message: string) => z.string().min(1, message);

    const attributeSchema = z.object({
        name: nonEmpty("Name must not be empty"),
        description: nonEmpty("Description must not be empty"),
    });

    // sameSite is restricted to the three listed values; every other cookie
    // field must be a string.
    const cookieSchema = z
        .object({ sameSite: z.enum(["None", "Lax", "Strict"]) })
        .catchall(z.string());

    return z.object({
        url: z.string().url(),
        apiKey: nonEmpty("API key must not be empty"),
        attributes: z
            .array(attributeSchema)
            .min(1, "At least one attribute is required"),
        proxyCountry: z.string().optional(),
        cookies: z.array(cookieSchema).optional(),
        precisionMode: z.boolean().optional(),
    });
};
35
describe("validateInput", () => {
    /** Smallest input the mock schema accepts; tests override single fields. */
    const buildInput = (overrides: Record<string, unknown> = {}) => ({
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [{ name: "title", description: "The title of the page" }],
        ...overrides,
    });

    /**
     * Runs validateInput once and returns the rejection message, so a test can
     * check several fragments without re-running validation for each one.
     */
    const rejectionMessage = async (input: unknown): Promise<string> => {
        try {
            await validateInput(input);
        } catch (error) {
            return error instanceof Error ? error.message : String(error);
        }
        throw new Error("Expected validateInput to reject, but it resolved");
    };

    beforeEach(() => {
        vi.clearAllMocks();
        // Every test validates against the same mock schema.
        vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());
    });

    const validCases = [
        { label: "minimal input", input: buildInput() },
        {
            label: "input with all optional fields",
            input: buildInput({
                proxyCountry: "us",
                cookies: [{ sameSite: "Lax", name: "test", value: "value" }],
                precisionMode: true,
            }),
        },
        {
            label: "multiple attributes",
            input: buildInput({
                attributes: [
                    { name: "title", description: "The title of the page" },
                    { name: "price", description: "Product price" },
                    { name: "description", description: "Product description" },
                ],
            }),
        },
        {
            label: "multiple cookies",
            input: buildInput({
                cookies: [
                    { sameSite: "Lax", name: "session", value: "123" },
                    { sameSite: "Strict", name: "preference", value: "dark" },
                    { sameSite: "None", name: "tracking", value: "allowed" },
                ],
            }),
        },
        {
            label: "an extremely long URL",
            input: buildInput({
                url: "https://example.com/" + "a".repeat(2000),
            }),
        },
        {
            label: "special characters in the API key",
            input: buildInput({
                apiKey: "test-api-key!@#$%^&*()_+-=[]{}|;:,.<>?",
            }),
        },
        {
            label: "unicode characters in attributes",
            input: buildInput({
                attributes: [
                    { name: "título", description: "描述 - 説明 - 설명" },
                ],
            }),
        },
        {
            label: 'proxyCountry "us"',
            input: buildInput({ proxyCountry: "us" }),
        },
        {
            label: 'proxyCountry "random"',
            input: buildInput({ proxyCountry: "random" }),
        },
        {
            label: "precisionMode true",
            input: buildInput({ precisionMode: true }),
        },
        {
            label: "precisionMode false",
            input: buildInput({ precisionMode: false }),
        },
    ];

    it.each(validCases)("should accept $label", async ({ input }) => {
        await expect(validateInput(input)).resolves.toEqual(input);
    });

    it("should reject input that is missing the API key", async () => {
        // Only apiKey is absent, so the failure can be attributed to it alone.
        const { url, attributes } = buildInput();

        const message = await rejectionMessage({ url, attributes });

        expect(message).toContain("Input validation failed");
        expect(message).toContain("apiKey");
    });

    it("should reject an invalid URL format", async () => {
        const message = await rejectionMessage(buildInput({ url: "not-a-url" }));

        expect(message).toContain("Input validation failed");
        expect(message).toContain("url");
    });

    it("should report a formatted message for an empty attributes array", async () => {
        const message = await rejectionMessage(buildInput({ attributes: [] }));

        expect(message).toContain("Input validation failed");
        expect(message).toContain("At least one attribute is required");
    });

    it("should report a formatted message for empty attribute strings", async () => {
        const message = await rejectionMessage(
            buildInput({ attributes: [{ name: "", description: "" }] })
        );

        expect(message).toContain("Input validation failed");
        expect(message).toContain("Name must not be empty");
        expect(message).toContain("Description must not be empty");
    });

    it("should reject cookies with an invalid sameSite value", async () => {
        const message = await rejectionMessage(
            buildInput({ cookies: [{ sameSite: "Invalid" }] })
        );

        expect(message).toContain("Input validation failed");
        expect(message).toContain("cookies.0.sameSite");
    });
});

src/utils/validation.ts

1import { z } from "zod";
2import { BaseInput } from "../types/parsera.js";
3import { createInputSchema } from "../schemas/input.js";
4
/**
 * Parses the raw Actor input with the Zod schema from `createInputSchema`.
 *
 * @param input - The unvalidated input object
 * @returns The parsed input, typed as `BaseInput`
 * @throws {Error} On a Zod validation failure, with one `path: message` line
 * per issue; any other error is rethrown unchanged
 */
export const validateInput = async (input: unknown): Promise<BaseInput> => {
    try {
        const inputSchema = await createInputSchema();
        const parsed = inputSchema.parse(input);
        return parsed as BaseInput;
    } catch (failure) {
        // Only Zod failures are reformatted; everything else passes through.
        if (!(failure instanceof z.ZodError)) {
            throw failure;
        }

        const lines: string[] = [];
        for (const issue of failure.errors) {
            lines.push(`${issue.path.join(".")}: ${issue.message}`);
        }
        throw new Error(`Input validation failed:\n${lines.join("\n")}`);
    }
};

src/main.ts

1import { Actor, log } from "apify";
2import { validateInput } from "./utils/validation.js";
3import { ExtractOptions, Parsera } from "./services/parsera.js";
4import type { ParseraResponse } from "./types/parsera.js";
5
await Actor.init();

/** Upper bound for the whole extraction call; the abort signal fires after it. */
const GLOBAL_TIMEOUT_MS = 5 * 60 * 1000;

// Declared outside the try block so the catch handler can check whether the
// failure was caused by our own abort. The name of the thrown error is not
// reliable for this because extract() re-wraps errors before throwing them.
const controller = new AbortController();

// Set when the run fails; decides between Actor.fail() and Actor.exit() below.
let failureMessage: string | undefined;

try {
    const input = await validateInput(await Actor.getInput());

    const parsera = new Parsera({
        apiKey: input.apiKey,
        timeout: 60000,
        retryOptions: {
            maxRetries: 3,
            backoffFactor: 2,
            initialDelay: 1000,
        },
        defaultProxyCountry: input.proxyCountry || "UnitedStates",
    });

    // Set up event handlers with proper typing
    parsera.on<ExtractOptions>("extract:start", (event) => {
        log.info("Starting extraction process", {
            url: event.data?.url,
            attributes: event.data?.attributes,
        });
    });

    parsera.on("request:retry", (event) => {
        log.warning("Retrying request", {
            attemptNumber: (event.retryCount ?? 0) + 1,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    parsera.on("rateLimit", () => {
        log.warning("Rate limit hit, backing off...");
    });

    parsera.on("request:error", (event) => {
        log.error("Request failed", {
            error: event.error?.message,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    parsera.on<ParseraResponse>("extract:complete", (event) => {
        log.info("Extraction completed successfully", {
            itemCount: event.data?.data?.length ?? 0,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    parsera.on("timeout", (event) => {
        log.warning("Request timed out", {
            error: event.error?.message,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    // Set up global timeout
    const timeout = setTimeout(() => {
        controller.abort();
        log.warning("Extraction timed out after 5 minutes");
    }, GLOBAL_TIMEOUT_MS);

    try {
        // Fields are passed explicitly rather than spreading `input`, so the
        // API key is not forwarded into the extract options.
        const extractedData = await parsera.extract({
            url: input.url,
            attributes: input.attributes,
            proxyCountry: input.proxyCountry,
            cookies: input.cookies,
            // Honour the user's setting; false only when it is not provided.
            precisionMode: input.precisionMode ?? false,
            signal: controller.signal,
        });

        await Actor.pushData(extractedData);

        log.info("Actor finished successfully", {
            extractedItemCount: extractedData.length,
            lastRunAt: new Date().toISOString(),
        });
    } finally {
        clearTimeout(timeout);
        parsera.removeAllListeners(); // Clean up event listeners
    }
} catch (error) {
    const message = error instanceof Error ? error.message : "Unknown error";

    if (controller.signal.aborted) {
        log.error("Extraction was aborted", {
            error: message,
            timestamp: new Date().toISOString(),
        });
    } else {
        log.error("Actor failed", {
            error: message,
            timestamp: new Date().toISOString(),
        });
    }
    failureMessage = message;
}

// Actor.exit() ends the process with exit code 0, so calling it after a
// failure would report the run as succeeded. Actor.fail() exits with a
// non-zero code instead.
if (failureMessage === undefined) {
    await Actor.exit();
} else {
    await Actor.fail(failureMessage);
}

.dockerignore

1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git
15
16# dist folder
17dist

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "root": true,
3    "env": {
4        "browser": true,
5        "es2020": true,
6        "node": true
7    },
8    "extends": [
9        "@apify/eslint-config-ts"
10    ],
11    "parserOptions": {
12        "project": "./tsconfig.json",
13        "ecmaVersion": 2020
14    },
15    "ignorePatterns": [
16        "node_modules",
17        "dist",
18        "**/*.d.ts"
19    ]
20}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
.vscode
apify_storage
crawlee_storage
node_modules
dist
tsconfig.tsbuildinfo

# Ignore local storage except the default input file.
# A bare `storage` entry must not be listed here: once the directory itself is
# excluded, Git does not look inside it and the `!` rules below cannot
# re-include anything.
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

# Added by Apify CLI
.venv

package.json

1{
2    "name": "apify-actor-parsera",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "Extract data from any webpage with AI using Parsera",
6    "engines": {
7        "node": ">=18.0.0"
8    },
9    "dependencies": {
10        "apify": "^3.2.6",
11        "cheerio": "^1.0.0-rc.12",
12        "zod": "^3.23.8"
13    },
14    "devDependencies": {
15        "@apify/eslint-config-ts": "^0.3.0",
16        "@apify/tsconfig": "^0.1.0",
17        "@typescript-eslint/eslint-plugin": "^7.18.0",
18        "@typescript-eslint/parser": "^7.18.0",
19        "eslint": "^8.50.0",
20        "tsx": "^4.6.2",
21        "typescript": "^5.3.3",
22        "vitest": "^2.1.6"
23    },
24    "scripts": {
25        "start": "npm run start:dev",
26        "start:prod": "node dist/main.js",
27        "start:dev": "tsx src/main.ts",
28        "build": "tsc",
29        "test": "vitest run",
30        "test:watch": "vitest",
31        "test:coverage": "vitest run --coverage"
32    },
33    "author": "Doug Silkstone <doug@parsera.org>",
34    "license": "ISC"
35}

tsconfig.json

{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "NodeNext",
        "moduleResolution": "NodeNext",
        "target": "ES2022",
        "outDir": "dist",
        "noUnusedLocals": false,
        "skipLibCheck": true,
        "lib": ["ES2022", "DOM"]
    },
    "include": [
        "./src/**/*"
    ]
}

vitest.config.ts

1import { defineConfig } from "vitest/config";
2
// Vitest configuration: global test APIs, Node environment, *.test.ts files,
// and V8 coverage with text, JSON and HTML reports.
const config = defineConfig({
    test: {
        environment: "node",
        globals: true,
        include: ["**/*.test.ts"],
        coverage: {
            reporter: ["text", "json", "html"],
            provider: "v8",
        },
    },
});

export default config;
Developer
Maintained by Community

Actor Metrics

  • 1 monthly user

  • 1 star

  • Created in Nov 2024

  • Modified 18 hours ago

Categories