Parsera
Try for free
No credit card required
Go to Store
Parsera
parsera-labs/parsera
Try for free
No credit card required
Extract Data from ANY website with Parsera.org
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20 AS builder

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY . ./

# Build the project. Dependencies (including dev dependencies needed by
# the build) were already installed in the step above.
RUN npm run build

# Create final image
FROM apify/actor-node:20

# Check preinstalled packages
RUN npm ls crawlee apify puppeteer playwright

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy built JS files from builder image.
# `--from=builder` is required: without it COPY reads from the build context
# instead of the first stage, and the compiled output is never picked up.
COPY --from=builder /usr/src/app/dist ./dist

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./


# Run the image.
CMD npm run start:prod --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "parsera",
4 "title": "Parsera Data Extractor",
5 "description": "Extract structured data from any webpage using AI-powered Parsera API",
6 "version": "1.0",
7 "buildTag": "latest",
8 "minMemoryMbytes": 256,
9 "maxMemoryMbytes": 4096,
10 "dockerfile": "./Dockerfile",
11 "input": "./input_schema.json"
12}
.actor/input_schema.json
1{
2 "description": "Configure the data extraction parameters for Parsera API",
3 "properties": {
4 "apiKey": {
5 "description": "Your Parsera API key for authentication",
6 "editor": "textfield",
7 "isSecret": true,
8 "title": "Parsera API Key",
9 "type": "string"
10 },
11 "attributes": {
12 "description": "Define what data to extract from the webpage",
13 "editor": "json",
14 "prefill": [
15 {
16 "description": "The main title of the page",
17 "name": "title"
18 }
19 ],
20 "sectionCaption": "Extraction Settings",
21 "sectionDescription": "Configure what data to extract from the webpage",
22 "title": "Extraction Attributes",
23 "type": "array"
24 },
25 "cookies": {
26 "description": "Custom cookies to use for the request (optional)",
27 "editor": "json",
28 "nullable": true,
29 "title": "Custom Cookies",
30 "type": "array"
31 },
32 "precisionMode": {
33 "default": false,
34 "description": "Enable for more accurate but slower extraction",
35 "nullable": true,
36 "title": "Precision Mode",
37 "type": "boolean"
38 },
39 "proxyCountry": {
40 "description": "Country for the proxy IP (optional)",
41 "editor": "select",
42 "enum": [
43 "random",
44 "Afghanistan",
45 "Albania",
46 "Algeria",
47 "Argentina",
48 "Armenia",
49 "Aruba",
50 "Australia",
51 "Austria",
52 "Azerbaijan",
53 "Bahamas",
54 "Bahrain",
55 "Bangladesh",
56 "Belarus",
57 "Belgium",
58 "BosniaandHerzegovina",
59 "Brazil",
60 "BritishVirginIslands",
61 "Brunei",
62 "Bulgaria",
63 "Cambodia",
64 "Cameroon",
65 "Canada",
66 "Chile",
67 "China",
68 "Colombia",
69 "CostaRica",
70 "Croatia",
71 "Cuba",
72 "Cyprus",
73 "Chechia",
74 "Denmark",
75 "DominicanRepublic",
76 "Ecuador",
77 "Egypt",
78 "ElSalvador",
79 "Estonia",
80 "Ethiopia",
81 "Finland",
82 "France",
83 "Georgia",
84 "Germany",
85 "Ghana",
86 "Greece",
87 "Guatemala",
88 "Guyana",
89 "HashemiteKingdomofJordan",
90 "HongKong",
91 "Hungary",
92 "India",
93 "Indonesia",
94 "Iran",
95 "Iraq",
96 "Ireland",
97 "Israel",
98 "Italy",
99 "Jamaica",
100 "Japan",
101 "Kazakhstan",
102 "Kenya",
103 "Kosovo",
104 "Kuwait",
105 "Latvia",
106 "Liechtenstein",
107 "Luxembourg",
108 "Macedonia",
109 "Madagascar",
110 "Malaysia",
111 "Mauritius",
112 "Mexico",
113 "Mongolia",
114 "Montenegro",
115 "Morocco",
116 "Mozambique",
117 "Myanmar",
118 "Nepal",
119 "Netherlands",
120 "NewZealand",
121 "Nigeria",
122 "Norway",
123 "Oman",
124 "Pakistan",
125 "Palestine",
126 "Panama",
127 "PapuaNewGuinea",
128 "Paraguay",
129 "Peru",
130 "Philippines",
131 "Poland",
132 "Portugal",
133 "PuertoRico",
134 "Qatar",
135 "RepublicOfLithuania",
136 "RepublicOfMoldova",
137 "Romania",
138 "Russia",
139 "SaudiArabia",
140 "Senegal",
141 "Serbia",
142 "Seychelles",
143 "Singapore",
144 "Slovakia",
145 "Slovenia",
146 "Somalia",
147 "SouthAfrica",
148 "SouthKorea",
149 "Spain",
150 "SriLanka",
151 "Sudan",
152 "Suriname",
153 "Sweden",
154 "Switzerland",
155 "Syria",
156 "Taiwan",
157 "Tajikistan",
158 "Thailand",
159 "TrinidadandTobago",
160 "Tunisia",
161 "Turkey",
162 "Uganda",
163 "Ukraine",
164 "UnitedArabEmirates",
165 "UnitedKingdom",
166 "UnitedStates",
167 "Uzbekistan",
168 "Venezuela",
169 "Vietnam",
170 "Zambia"
171 ],
172 "enumTitles": [
173 "Random Country",
174 "Afghanistan",
175 "Albania",
176 "Algeria",
177 "Argentina",
178 "Armenia",
179 "Aruba",
180 "Australia",
181 "Austria",
182 "Azerbaijan",
183 "Bahamas",
184 "Bahrain",
185 "Bangladesh",
186 "Belarus",
187 "Belgium",
188 "Bosnia and Herzegovina",
189 "Brazil",
190 "British Virgin Islands",
191 "Brunei",
192 "Bulgaria",
193 "Cambodia",
194 "Cameroon",
195 "Canada",
196 "Chile",
197 "China",
198 "Colombia",
199 "Costa Rica",
200 "Croatia",
201 "Cuba",
202 "Cyprus",
203 "Czechia",
204 "Denmark",
205 "Dominican Republic",
206 "Ecuador",
207 "Egypt",
208 "El Salvador",
209 "Estonia",
210 "Ethiopia",
211 "Finland",
212 "France",
213 "Georgia",
214 "Germany",
215 "Ghana",
216 "Greece",
217 "Guatemala",
218 "Guyana",
219 "Hashemite Kingdom of Jordan",
220 "Hong Kong",
221 "Hungary",
222 "India",
223 "Indonesia",
224 "Iran",
225 "Iraq",
226 "Ireland",
227 "Israel",
228 "Italy",
229 "Jamaica",
230 "Japan",
231 "Kazakhstan",
232 "Kenya",
233 "Kosovo",
234 "Kuwait",
235 "Latvia",
236 "Liechtenstein",
237 "Luxembourg",
238 "Macedonia",
239 "Madagascar",
240 "Malaysia",
241 "Mauritius",
242 "Mexico",
243 "Mongolia",
244 "Montenegro",
245 "Morocco",
246 "Mozambique",
247 "Myanmar",
248 "Nepal",
249 "Netherlands",
250 "New Zealand",
251 "Nigeria",
252 "Norway",
253 "Oman",
254 "Pakistan",
255 "Palestine",
256 "Panama",
257 "Papua New Guinea",
258 "Paraguay",
259 "Peru",
260 "Philippines",
261 "Poland",
262 "Portugal",
263 "Puerto Rico",
264 "Qatar",
265 "Republic of Lithuania",
266 "Republic of Moldova",
267 "Romania",
268 "Russia",
269 "Saudi Arabia",
270 "Senegal",
271 "Serbia",
272 "Seychelles",
273 "Singapore",
274 "Slovakia",
275 "Slovenia",
276 "Somalia",
277 "South Africa",
278 "South Korea",
279 "Spain",
280 "Sri Lanka",
281 "Sudan",
282 "Suriname",
283 "Sweden",
284 "Switzerland",
285 "Syria",
286 "Taiwan",
287 "Tajikistan",
288 "Thailand",
289 "Trinidad and Tobago",
290 "Tunisia",
291 "Turkey",
292 "Uganda",
293 "Ukraine",
294 "United Arab Emirates",
295 "United Kingdom",
296 "United States",
297 "Uzbekistan",
298 "Venezuela",
299 "Vietnam",
300 "Zambia"
301 ],
302 "nullable": true,
303 "sectionCaption": "Advanced Options",
304 "sectionDescription": "Optional settings for customizing the extraction process",
305 "title": "Proxy Location",
306 "type": "string"
307 },
308 "url": {
309 "description": "The webpage URL to extract data from",
310 "editor": "textfield",
311 "pattern": "^https?://[^\\s/$.?#].[^\\s]*$",
312 "sectionCaption": "Basic Configuration",
313 "sectionDescription": "Essential settings for data extraction",
314 "title": "Target URL",
315 "type": "string"
316 }
317 },
318 "required": ["url", "apiKey", "attributes"],
319 "schemaVersion": 1,
320 "title": "Parsera Data Extractor Input",
321 "type": "object"
322}
src/config/constants.ts
1export const PARSERA_API_BASE_URL = "https://api.parsera.org/v1";
2export const DEFAULT_PROXY_COUNTRY = "UnitedStates";
src/schemas/input.ts
1import { z } from "zod";
2import { PARSERA_API_BASE_URL } from "../config/constants.js";
3import { log } from "apify";
4
/**
 * One attribute to extract: a `name` and a `description`, both of which must
 * be non-empty strings.
 */
export const AttributeSchema = z.object({
    name: z.string().min(1, "Attribute name must not be empty"),
    description: z.string().min(1, "Attribute description must not be empty"),
});

/**
 * One cookie entry. `sameSite` must be one of the three listed values; every
 * other key on the object is accepted as long as its value is a string.
 */
export const CookieSchema = z
    .object({ sameSite: z.enum(["None", "Lax", "Strict"]) })
    .catchall(z.string());
15
/**
 * Fetches the list of valid proxy countries from the Parsera API and builds a
 * validator from it.
 *
 * The request is bounded by a 10 second timeout so an unresponsive endpoint
 * cannot stall input validation indefinitely. Any failure (network error,
 * timeout, non-2xx status, unexpected response shape) is logged as a warning
 * and degrades to plain string validation instead of throwing.
 *
 * @returns A Zod enum of `"random"` plus the country keys returned by the
 * API, or `z.string()` when the list could not be obtained
 */
export const getProxyCountriesSchema = async () => {
    try {
        const response = await fetch(
            `${PARSERA_API_BASE_URL}/proxy-countries`,
            { signal: AbortSignal.timeout(10000) }
        );
        if (!response.ok) {
            throw new Error(
                `Failed to fetch proxy countries: ${response.statusText}`
            );
        }

        const data: unknown = await response.json();
        // Object.keys() on an array or primitive would silently produce
        // nonsense country names, so insist on a plain object.
        if (data === null || typeof data !== "object" || Array.isArray(data)) {
            throw new Error(
                "Failed to fetch proxy countries: unexpected response shape"
            );
        }

        // "random" is always prepended below; drop it from the fetched keys so
        // the enum never contains a duplicate entry.
        const proxyCountries = Object.keys(data).filter(
            (country) => country !== "random"
        );
        return z.enum(["random", ...proxyCountries] as [string, ...string[]]);
    } catch (error) {
        log.warning(
            "Failed to fetch proxy countries. Using default validation.",
            // Error instances serialise to `{}`; log the message text instead.
            { error: error instanceof Error ? error.message : String(error) }
        );
        // Fallback to basic string validation if API call fails
        return z.string();
    }
};
40
/**
 * Builds the actor input validator. Asynchronous because the accepted
 * `proxyCountry` values are resolved through `getProxyCountriesSchema()`.
 *
 * NOTE(review): the actor's input_schema.json marks `proxyCountry`, `cookies`
 * and `precisionMode` as `nullable`, while `.optional()` below rejects an
 * explicit `null` — confirm whether null values can reach this validation.
 */
export const createInputSchema = async () => {
    const proxyCountry = (await getProxyCountriesSchema()).optional();

    const url = z.string().url("Invalid URL format");
    const apiKey = z.string().min(1, "API key must not be empty");
    const attributes = z
        .array(AttributeSchema)
        .min(1, "At least one attribute is required");
    const cookies = z.array(CookieSchema).optional();
    const precisionMode = z.boolean().optional();

    // Key order matches the original declaration order.
    return z.object({
        url,
        apiKey,
        attributes,
        proxyCountry,
        cookies,
        precisionMode,
    });
};
src/services/parsera.ts
1import {
2 ParseraError,
3 ParseraRequestBody,
4 ParseraResponse,
5} from "../types/parsera.js";
6
7export interface ParseraRetryOptions {
8 /**
9 * Maximum number of retry attempts for failed requests
10 * Applies to network errors, timeouts, and rate limits
11 * @default 3
12 */
13 maxRetries?: number;
14
15 /**
16 * Multiplier for exponential backoff between retries
17 * Each retry will wait: initialDelay * (backoffFactor ^ retryCount)
18 * Example: 1000ms, then 2000ms, then 4000ms
19 * @default 2
20 */
21 backoffFactor?: number;
22
23 /**
24 * Initial delay (in milliseconds) before the first retry
25 * This delay will be multiplied by backoffFactor for subsequent retries
26 * @default 1000 (1 second)
27 */
28 initialDelay?: number;
29}
30
31export interface ParseraAttribute {
32 /**
33 * Name of the attribute to extract
34 * This will be the key in the returned data object
35 */
36 name: string;
37
38 /**
39 * Natural language description of what to extract
40 * Be as specific as possible about the data you want
41 */
42 description: string;
43}
44
45export interface ParseraCookie {
46 /**
47 * Cookie properties as key-value pairs
48 */
49 [key: string]: string;
50
51 /**
52 * SameSite attribute for the cookie
53 * Controls how the cookie behaves with cross-site requests
54 */
55 sameSite: "None" | "Lax" | "Strict";
56}
57
58export interface ParseraOptions {
59 /**
60 * Your Parsera API key
61 * Required for authentication with the API
62 */
63 apiKey: string;
64
65 /**
66 * Base URL for the Parsera API
67 * Only change this if you're using a custom API endpoint
68 * @default "https://api.parsera.org/v1"
69 */
70 baseUrl?: string;
71
72 /**
73 * Default country code for proxy servers
74 * Used when no specific proxyCountry is provided in the request
75 * @example "random" | "UnitedStates" | "UnitedKingdom" | "Germany" | "Japan" | "France" | "Canada"
76 * @default "UnitedStates"
77 * @see https://api.parsera.org/v1/proxy-countries for full list of supported countries
78 */
79 defaultProxyCountry?: string;
80
81 /**
82 * Maximum time (in milliseconds) to wait for each API request
83 * If a request takes longer, it will be aborted and retried
84 * @default 30000 (30 seconds)
85 */
86 timeout?: number;
87
88 /**
89 * Configuration for retry behavior on failed requests
90 * @see ParseraRetryOptions
91 */
92 retryOptions?: ParseraRetryOptions;
93}
94
95export interface ExtractOptions {
96 /**
97 * URL of the webpage to extract data from
98 * Must be a valid HTTP/HTTPS URL
99 */
100 url: string;
101
102 /**
103 * Attributes to extract from the webpage
104 * Can be either an array of ParseraAttribute objects
105 * or a Record of name-description pairs
106 */
107 attributes: ParseraAttribute[] | Record<string, string>;
108
109 /**
110 * Country code for proxy server location
111 * Overrides the defaultProxyCountry setting
112 * @example "UnitedStates" | "UnitedKingdom" | "Germany" | "Japan"
113 */
114 proxyCountry?: string;
115
116 /**
117 * Cookies to be sent with the request
118 * Useful for accessing pages that require authentication
119 */
120 cookies?: ParseraCookie[];
121
122 /**
123 * Enable precision mode for more accurate extractions
124 * May increase processing time
125 * @default false
126 */
127 precisionMode?: boolean;
128
129 /**
130 * AbortSignal for request cancellation
131 * Allows the request to be aborted if it exceeds timeout
132 * or if cancellation is needed
133 */
134 signal?: AbortSignal;
135}
136
137export type ParseraEventType =
138 | "request:start"
139 | "request:end"
140 | "request:retry"
141 | "request:error"
142 | "extract:start"
143 | "extract:complete"
144 | "extract:error"
145 | "rateLimit"
146 | "timeout"
147 | string; // Allow custom event types
148
149export interface ParseraEvent<T = unknown> {
150 type: ParseraEventType;
151 timestamp: number;
152 data?: T;
153 error?: Error;
154 retryCount?: number;
155}
156
157export type ParseraEventHandler<T = unknown> = (
158 event: ParseraEvent<T>
159) => void | Promise<void>;
160
161export interface ParseraEventOptions {
162 /**
163 * Whether to handle the event asynchronously
164 * When true, event handlers won't block the main execution
165 * @default false
166 */
167 async?: boolean;
168
169 /**
170 * Whether to catch errors in event handlers
171 * When true, errors in handlers won't affect the main execution
172 * @default true
173 */
174 catchErrors?: boolean;
175}
176
177export class Parsera {
178 private readonly apiKey: string;
179 private readonly baseUrl: string;
180 private readonly defaultProxyCountry: string;
181 private readonly timeout: number;
182 private readonly retryOptions: Required<ParseraRetryOptions>;
183 private lastRequestTime: number = 0;
184 private readonly minRequestInterval: number = 100; // 100ms between requests
185 private readonly eventHandlers = new Map<
186 ParseraEventType,
187 Set<ParseraEventHandler<any>>
188 >();
189 private readonly eventOptions = new Map<
190 ParseraEventType,
191 ParseraEventOptions
192 >();
193
194 /**
195 * Creates a new Parsera client instance.
196 *
197 * @example
198 * ```typescript
199 * const parsera = new Parsera({
200 * apiKey: "your-api-key",
201 * timeout: 60000, // 60 second timeout
202 * retryOptions: {
203 * maxRetries: 3,
204 * backoffFactor: 2,
205 * initialDelay: 1000,
206 * }
207 * });
208 * ```
209 */
210 constructor({
211 apiKey,
212 baseUrl = "https://api.parsera.org/v1",
213 defaultProxyCountry = "US",
214 timeout = 30000,
215 retryOptions = {},
216 }: ParseraOptions) {
217 this.validateApiKey(apiKey);
218 this.apiKey = apiKey;
219 this.baseUrl = baseUrl;
220 this.defaultProxyCountry = defaultProxyCountry;
221 this.timeout = timeout;
222 this.retryOptions = {
223 maxRetries: retryOptions.maxRetries ?? 3,
224 backoffFactor: retryOptions.backoffFactor ?? 2,
225 initialDelay: retryOptions.initialDelay ?? 1000,
226 };
227 }
228
229 private validateApiKey(apiKey: string): void {
230 if (!apiKey || typeof apiKey !== "string" || apiKey.length < 32) {
231 throw new Error("Invalid API key format");
232 }
233 }
234
235 private validateUrl(url: string): void {
236 try {
237 new URL(url);
238 } catch {
239 throw new Error("Invalid URL format");
240 }
241 }
242
243 private async enforceRateLimit(): Promise<void> {
244 const now = Date.now();
245 const timeSinceLastRequest = now - this.lastRequestTime;
246
247 if (timeSinceLastRequest < this.minRequestInterval) {
248 await new Promise((resolve) =>
249 setTimeout(
250 resolve,
251 this.minRequestInterval - timeSinceLastRequest
252 )
253 );
254 }
255 this.lastRequestTime = Date.now();
256 }
257
258 private async fetchWithTimeout(
259 url: string,
260 options: RequestInit & { timeout?: number }
261 ): Promise<Response> {
262 const { timeout = this.timeout, ...fetchOptions } = options;
263
264 const controller = new AbortController();
265 if (options.signal) {
266 options.signal.addEventListener("abort", () => controller.abort());
267 }
268
269 const timeoutId = setTimeout(() => controller.abort(), timeout);
270
271 try {
272 const response = await fetch(url, {
273 ...fetchOptions,
274 signal: controller.signal,
275 });
276 return response;
277 } finally {
278 clearTimeout(timeoutId);
279 }
280 }
281
282 private async retryableRequest(
283 requestFn: () => Promise<Response>,
284 retryCount = 0
285 ): Promise<Response> {
286 try {
287 await this.enforceRateLimit();
288 const response = await requestFn();
289
290 if (
291 response.status === 429 &&
292 retryCount < this.retryOptions.maxRetries
293 ) {
294 await this.emit("rateLimit", { retryCount });
295 await this.emit("request:retry", { retryCount });
296
297 const delay =
298 this.retryOptions.initialDelay *
299 Math.pow(this.retryOptions.backoffFactor, retryCount);
300 await new Promise((resolve) => setTimeout(resolve, delay));
301 return this.retryableRequest(requestFn, retryCount + 1);
302 }
303
304 return response;
305 } catch (error) {
306 if (error instanceof Error) {
307 if (error.name === "AbortError") {
308 await this.emit("timeout", undefined, error);
309 }
310 await this.emit("request:error", undefined, error);
311
312 if (
313 retryCount < this.retryOptions.maxRetries &&
314 this.isRetryableError(error)
315 ) {
316 await this.emit("request:retry", { retryCount });
317 const delay =
318 this.retryOptions.initialDelay *
319 Math.pow(this.retryOptions.backoffFactor, retryCount);
320 await new Promise((resolve) => setTimeout(resolve, delay));
321 return this.retryableRequest(requestFn, retryCount + 1);
322 }
323 }
324 throw error;
325 }
326 }
327
328 private isRetryableError(error: unknown): boolean {
329 return (
330 error instanceof Error &&
331 (error.message.includes("network") ||
332 error.message.includes("timeout") ||
333 error.message.includes("ECONNRESET"))
334 );
335 }
336
337 /**
338 * Converts a Record<string, string> to ParseraAttribute[]
339 */
340 private convertToAttributes(
341 attrs: Record<string, string>
342 ): ParseraAttribute[] {
343 return Object.entries(attrs).map(([name, description]) => ({
344 name,
345 description,
346 }));
347 }
348
349 /**
350 * Registers an event handler for a specific event type
351 *
352 * @param eventType - Type of event to listen for
353 * @param handler - Function to handle the event
354 * @param options - Configuration options for event handling
355 *
356 * @example
357 * ```typescript
358 * parsera.on('extract:complete', (event) => {
359 * console.log(`Extraction completed with ${event.data.length} items`);
360 * });
361 *
362 * parsera.on('request:retry', (event) => {
363 * console.log(`Retrying request (attempt ${event.retryCount})`);
364 * });
365 *
366 * // Custom event
367 * parsera.on('my:custom:event', (event) => {
368 * console.log('Custom event data:', event.data);
369 * });
370 * ```
371 */
372 on<T = unknown>(
373 eventType: ParseraEventType,
374 handler: ParseraEventHandler<T>,
375 options: ParseraEventOptions = {}
376 ): void {
377 if (!this.eventHandlers.has(eventType)) {
378 this.eventHandlers.set(eventType, new Set());
379 }
380 this.eventHandlers
381 .get(eventType)!
382 .add(handler as ParseraEventHandler<any>);
383 this.eventOptions.set(eventType, {
384 async: options.async ?? false,
385 catchErrors: options.catchErrors ?? true,
386 });
387 }
388
389 /**
390 * Removes an event handler for a specific event type
391 */
392 off<T = unknown>(
393 eventType: ParseraEventType,
394 handler: ParseraEventHandler<T>
395 ): void {
396 const handlers = this.eventHandlers.get(eventType);
397 if (handlers) {
398 handlers.delete(handler as ParseraEventHandler<any>);
399 }
400 }
401
402 /**
403 * Removes all event handlers for a specific event type
404 */
405 removeAllListeners(eventType?: ParseraEventType): void {
406 if (eventType) {
407 this.eventHandlers.delete(eventType);
408 } else {
409 this.eventHandlers.clear();
410 }
411 }
412
413 private async emit<T = unknown>(
414 eventType: ParseraEventType,
415 data?: T,
416 error?: Error,
417 retryCount?: number
418 ): Promise<void> {
419 const handlers = this.eventHandlers.get(eventType);
420 if (!handlers?.size) return;
421
422 const event: ParseraEvent<T> = {
423 type: eventType,
424 timestamp: Date.now(),
425 data,
426 error,
427 retryCount,
428 };
429
430 const options = this.eventOptions.get(eventType) ?? {
431 async: false,
432 catchErrors: true,
433 };
434
435 const handleEvent = async (handler: ParseraEventHandler<any>) => {
436 try {
437 await handler(event);
438 } catch (error) {
439 if (!options.catchErrors) {
440 throw error;
441 }
442 await this.emit("handler:error", undefined, error as Error);
443 }
444 };
445
446 if (options.async) {
447 handlers.forEach((handler) => {
448 handleEvent(handler).catch(() => {});
449 });
450 } else {
451 await Promise.all(
452 Array.from(handlers).map((handler) => handleEvent(handler))
453 );
454 }
455 }
456
457 /**
458 * Extracts data from a webpage using the Parsera API.
459 *
460 * @param options - Configuration options for the extraction
461 * @returns Promise resolving to an array of extracted data objects
462 *
463 * @throws {Error} When API key is invalid
464 * @throws {Error} When URL is invalid
465 * @throws {Error} When request times out
466 * @throws {Error} When rate limit is exceeded (after retries)
467 * @throws {Error} When no data is found
468 *
469 * @example
470 * ```typescript
471 * // Basic usage with attribute record
472 * const results = await parsera.extract({
473 * url: "https://example.com/products",
474 * attributes: {
475 * title: "Extract the product title",
476 * price: "Get the product price",
477 * }
478 * });
479 *
480 * // Advanced usage with all options
481 * const results = await parsera.extract({
482 * url: "https://example.com/products",
483 * attributes: [
484 * { name: "title", description: "Extract the product title" },
485 * { name: "price", description: "Get the product price" }
486 * ],
487 * proxyCountry: "UnitedKingdom",
488 * cookies: [
489 * { name: "session", value: "abc123", sameSite: "Lax" }
490 * ],
491 * precisionMode: true,
492 * signal: abortController.signal
493 * });
494 *
495 * // With request cancellation
496 * const controller = new AbortController();
497 * const promise = parsera.extract({
498 * url: "https://example.com",
499 * attributes: { title: "Extract the title" },
500 * signal: controller.signal
501 * });
502 *
503 * // Cancel the request after 5 seconds
504 * setTimeout(() => controller.abort(), 5000);
505 * ```
506 *
507 * @example
508 * // Example return value:
509 * [
510 * {
511 * "title": "Product Name",
512 * "price": "$99.99"
513 * },
514 * {
515 * "title": "Another Product",
516 * "price": "$149.99"
517 * }
518 * ]
519 */
520 async extract({
521 url,
522 attributes,
523 proxyCountry,
524 cookies,
525 precisionMode,
526 signal,
527 }: ExtractOptions): Promise<Record<string, string>[]> {
528 await this.emit("extract:start", {
529 url,
530 attributes,
531 proxyCountry,
532 cookies,
533 precisionMode,
534 signal,
535 });
536
537 this.validateUrl(url);
538
539 try {
540 const requestBody: ParseraRequestBody = {
541 url,
542 attributes: Array.isArray(attributes)
543 ? attributes
544 : this.convertToAttributes(attributes),
545 proxy_country: proxyCountry || this.defaultProxyCountry,
546 };
547
548 if (cookies) {
549 requestBody.cookies = cookies;
550 }
551
552 if (precisionMode) {
553 requestBody.mode = "precision";
554 }
555
556 const response = await this.retryableRequest(() =>
557 this.fetchWithTimeout(`${this.baseUrl}/extract`, {
558 method: "POST",
559 headers: {
560 "Content-Type": "application/json",
561 "X-API-KEY": this.apiKey,
562 },
563 body: JSON.stringify(requestBody),
564 signal,
565 })
566 );
567
568 if (!response.ok) {
569 await this.handleError(response);
570 }
571
572 const data = (await response.json()) as ParseraResponse;
573 if (!data.data?.length) {
574 throw new Error(
575 data.message ||
576 "No data returned from Parsera API. Make sure the website contains the data and the attribute descriptions are clear."
577 );
578 }
579
580 await this.emit("extract:complete", data);
581 return data.data;
582 } catch (error) {
583 if (error instanceof Error) {
584 await this.emit("extract:error", undefined, error);
585 throw new Error(`Failed to extract data: ${error.message}`);
586 }
587 throw new Error("Failed to extract data: Unknown error");
588 }
589 }
590
591 /**
592 * Alias for extract method to match Python library interface.
593 *
594 * @see {@link extract} for full documentation and examples
595 *
596 * @example
597 * ```typescript
598 * const results = await parsera.run({
599 * url: "https://example.com",
600 * attributes: { title: "Extract the title" }
601 * });
602 * ```
603 */
604 async run(options: ExtractOptions): Promise<Record<string, string>[]> {
605 return this.extract(options);
606 }
607
608 /**
609 * Alias for extract method to match Python library interface.
610 *
611 * @see {@link extract} for full documentation and examples
612 *
613 * @example
614 * ```typescript
615 * const results = await parsera.arun({
616 * url: "https://example.com",
617 * attributes: { title: "Extract the title" }
618 * });
619 * ```
620 */
621 async arun(options: ExtractOptions): Promise<Record<string, string>[]> {
622 return this.extract(options);
623 }
624
625 private async handleError(response: Response): Promise<never> {
626 const status = response.status;
627 const errorData = (await response.json()) as ParseraError;
628
629 switch (status) {
630 case 401:
631 throw new Error(
632 "Invalid Parsera API key. Please check your credentials."
633 );
634 case 429:
635 throw new Error("Rate limit exceeded. Please try again later.");
636 case 400:
637 throw new Error(
638 `Bad request: ${errorData?.message || "Unknown error"}`
639 );
640 default:
641 throw new Error(
642 `Parsera API error: ${
643 errorData?.message || response.statusText
644 }`
645 );
646 }
647 }
648}
src/types/parsera.ts
1export interface BaseInput {
2 url: string;
3 apiKey: string;
4 attributes: {
5 name: string;
6 description: string;
7 }[];
8 proxyCountry?: string;
9 cookies?: {
10 [key: string]: string;
11 sameSite: "None" | "Lax" | "Strict";
12 }[];
13 precisionMode?: boolean;
14}
15
16export interface ParseraResponse {
17 /** Array of extracted data objects */
18 data: Record<string, string>[];
19 /** Message from the API */
20 message?: string;
21}
22
23export interface ParseraError {
24 /** Error message from the API */
25 message: string;
26 /** Error code for identifying the type of error */
27 code: string;
28}
29
30export interface ParseraRequestBody {
31 /** Target URL to extract data from */
32 url: string;
33 /** Array of attributes to extract */
34 attributes: {
35 name: string;
36 description: string;
37 }[];
38 /** Country for proxy IP */
39 proxy_country?: string;
40 /** Cookies to inject into the request */
41 cookies?: {
42 [key: string]: string;
43 sameSite: "None" | "Lax" | "Strict";
44 }[];
45 /** Extraction mode: "standard" or "precision" */
46 mode?: "standard" | "precision";
47}
src/utils/validation.test.ts
1import { describe, it, expect, vi, beforeEach } from "vitest";
2import { validateInput } from "./validation.js";
3import { createInputSchema } from "../schemas/input.js";
4import { z } from "zod";
5
6vi.mock("../schemas/input.js", () => ({
7 createInputSchema: vi.fn(),
8}));
9
/**
 * Builds the schema handed to the mocked `createInputSchema` in these tests.
 * `proxyCountry` is a plain optional string here, so no country list is
 * needed.
 */
const createMockSchema = () => {
    const cookie = z
        .object({ sameSite: z.enum(["None", "Lax", "Strict"]) })
        .catchall(z.string());

    const attribute = z.object({
        name: z.string().min(1, "Name must not be empty"),
        description: z.string().min(1, "Description must not be empty"),
    });

    return z.object({
        url: z.string().url(),
        apiKey: z.string().min(1, "API key must not be empty"),
        attributes: z
            .array(attribute)
            .min(1, "At least one attribute is required"),
        proxyCountry: z.string().optional(),
        cookies: z.array(cookie).optional(),
        precisionMode: z.boolean().optional(),
    });
};
35
36describe("validateInput", () => {
37 beforeEach(() => {
38 vi.clearAllMocks();
39 });
40
41 it("should validate correct minimal input successfully", async () => {
42 const validInput = {
43 url: "https://example.com",
44 apiKey: "test-api-key",
45 attributes: [
46 {
47 name: "title",
48 description: "The title of the page",
49 },
50 ],
51 };
52
53 const mockSchema = createMockSchema();
54 vi.mocked(createInputSchema).mockResolvedValue(mockSchema);
55
56 const result = await validateInput(validInput);
57 expect(result).toEqual(validInput);
58 });
59
60 it("should validate input with all optional fields", async () => {
61 const validInput = {
62 url: "https://example.com",
63 apiKey: "test-api-key",
64 attributes: [
65 {
66 name: "title",
67 description: "The title of the page",
68 },
69 ],
70 proxyCountry: "us",
71 cookies: [{ sameSite: "Lax", name: "test", value: "value" }],
72 precisionMode: true,
73 };
74
75 const mockSchema = createMockSchema();
76 vi.mocked(createInputSchema).mockResolvedValue(mockSchema);
77
78 const result = await validateInput(validInput);
79 expect(result).toEqual(validInput);
80 });
81
82 it("should throw error for missing required fields", async () => {
83 const invalidInput = {
84 url: "https://example.com",
85 // missing apiKey
86 attributes: [],
87 };
88
89 const mockSchema = createMockSchema();
90 vi.mocked(createInputSchema).mockResolvedValue(mockSchema);
91
92 await expect(validateInput(invalidInput)).rejects.toThrow();
93 });
94
95 it("should throw error for invalid URL format", async () => {
96 const invalidInput = {
97 url: "not-a-url",
98 apiKey: "test-api-key",
99 attributes: [
100 {
101 name: "title",
102 description: "The title of the page",
103 },
104 ],
105 };
106
107 const mockSchema = createMockSchema();
108 vi.mocked(createInputSchema).mockResolvedValue(mockSchema);
109
110 await expect(validateInput(invalidInput)).rejects.toThrow();
111 });
112
113 it("should throw error with formatted message for empty attributes array", async () => {
114 const invalidInput = {
115 url: "https://example.com",
116 apiKey: "test-api-key",
117 attributes: [],
118 };
119
120 const mockSchema = createMockSchema();
121 vi.mocked(createInputSchema).mockResolvedValue(mockSchema);
122
123 await expect(async () => {
124 await validateInput(invalidInput);
125 }).rejects.toThrow("Input validation failed");
126
127 await expect(async () => {
128 await validateInput(invalidInput);
129 }).rejects.toThrow("At least one attribute is required");
130 });
131
it("should throw error with formatted message for empty strings", async () => {
    const inputWithBlankAttribute = {
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [{ name: "", description: "" }],
    };

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    // The error must carry the generic header plus one message per blank field.
    const expectedFragments = [
        "Input validation failed",
        "Name must not be empty",
        "Description must not be empty",
    ];
    for (const fragment of expectedFragments) {
        await expect(validateInput(inputWithBlankAttribute)).rejects.toThrow(fragment);
    }
});
154
it("should reject cookies with an invalid sameSite value", async () => {
    // The cookie carries a name and value like the valid cookies used in the
    // other tests, so sameSite is the only field that can trigger a rejection.
    const invalidInput = {
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [{ name: "title", description: "test" }],
        cookies: [{ sameSite: "Invalid", name: "test", value: "value" }], // Invalid sameSite value
    };

    const mockSchema = createMockSchema();
    vi.mocked(createInputSchema).mockResolvedValue(mockSchema);

    // validateInput prefixes each issue with its path, so the failure must
    // be reported under the cookies field.
    await expect(validateInput(invalidInput)).rejects.toThrow("cookies");
});
168
it("should handle multiple attributes correctly", async () => {
    // Three distinct attributes, built from [name, description] pairs.
    const attributes = [
        ["title", "The title of the page"],
        ["price", "Product price"],
        ["description", "Product description"],
    ].map(([name, description]) => ({ name, description }));

    const validInput = {
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes,
    };

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    // A valid input must come back unchanged.
    await expect(validateInput(validInput)).resolves.toEqual(validInput);
});
195
it("should handle multiple cookies correctly", async () => {
    // One cookie per sameSite value exercised by this suite.
    const cookies = [
        { sameSite: "Lax", name: "session", value: "123" },
        { sameSite: "Strict", name: "preference", value: "dark" },
        { sameSite: "None", name: "tracking", value: "allowed" },
    ];
    const validInput = {
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [{ name: "title", description: "test" }],
        cookies,
    };

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    await expect(validateInput(validInput)).resolves.toEqual(validInput);
});
214
it("should validate extremely long URLs", async () => {
    // A syntactically valid URL whose path is 2000 characters long.
    const pathSegment = "a".repeat(2000);
    const validInput = {
        url: `https://example.com/${pathSegment}`,
        apiKey: "test-api-key",
        attributes: [{ name: "title", description: "test" }],
    };

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    await expect(validateInput(validInput)).resolves.toEqual(validInput);
});
229
it("should handle special characters in API key", async () => {
    // The key is expected to pass through validation untouched, punctuation included.
    const punctuatedKey = "test-api-key!@#$%^&*()_+-=[]{}|;:,.<>?";
    const validInput = {
        url: "https://example.com",
        apiKey: punctuatedKey,
        attributes: [{ name: "title", description: "test" }],
    };

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    await expect(validateInput(validInput)).resolves.toEqual(validInput);
});
243
it("should handle unicode characters in attributes", async () => {
    // Non-ASCII text in both the attribute name and its description.
    const unicodeAttribute = {
        name: "título",
        description: "描述 - 説明 - 설명",
    };
    const validInput = {
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [unicodeAttribute],
    };

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    await expect(validateInput(validInput)).resolves.toEqual(validInput);
});
262
it("should validate all proxy country options", async () => {
    // One otherwise-identical input per proxy country value under test.
    const validInputs = ["us", "random"].map((proxyCountry) => ({
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [{ name: "title", description: "test" }],
        proxyCountry,
    }));

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    // Validate sequentially; each input must be returned unchanged.
    for (const candidate of validInputs) {
        await expect(validateInput(candidate)).resolves.toEqual(candidate);
    }
});
287
it("should handle boolean precision mode values correctly", async () => {
    // Same input twice, once with precision mode on and once with it off.
    const validInputs = [true, false].map((precisionMode) => ({
        url: "https://example.com",
        apiKey: "test-api-key",
        attributes: [{ name: "title", description: "test" }],
        precisionMode,
    }));

    vi.mocked(createInputSchema).mockResolvedValue(createMockSchema());

    // Validate sequentially; each input must be returned unchanged.
    for (const candidate of validInputs) {
        await expect(validateInput(candidate)).resolves.toEqual(candidate);
    }
});
312});
src/utils/validation.ts
1import { z } from "zod";
2import { BaseInput } from "../types/parsera.js";
3import { createInputSchema } from "../schemas/input.js";
4
/**
 * Validates the Actor input against the Zod schema returned by
 * `createInputSchema`.
 *
 * @param input - The raw, untyped input object to validate
 * @throws {Error} With the message `Input validation failed:` followed by one
 *   `path: message` line per Zod issue when the input does not match the
 *   schema. Any non-Zod error is rethrown unchanged.
 * @returns The parsed input, typed as `BaseInput`
 */
export const validateInput = async (input: unknown): Promise<BaseInput> => {
    try {
        const schema = await createInputSchema();
        return schema.parse(input) as BaseInput;
    } catch (error) {
        if (error instanceof z.ZodError) {
            const errorMessages = error.errors
                .map((err) => {
                    // Issues raised on the input as a whole have an empty path;
                    // label them so the line does not start with a bare ": ".
                    const location = err.path.length > 0 ? err.path.join(".") : "input";
                    return `${location}: ${err.message}`;
                })
                .join("\n");
            throw new Error(`Input validation failed:\n${errorMessages}`);
        }
        throw error;
    }
};
src/main.ts
1import { Actor, log } from "apify";
2import { validateInput } from "./utils/validation.js";
3import { ExtractOptions, Parsera } from "./services/parsera.js";
4import type { ParseraResponse } from "./types/parsera.js";
5
// Entry point of the Actor: validates the input, runs a single Parsera
// extraction guarded by a global timeout, pushes the result to the default
// dataset and exits with a status that reflects the outcome.

// Per-request timeout handed to the Parsera client.
const REQUEST_TIMEOUT_MS = 60_000;
// Upper bound for the whole extraction before it is aborted.
const GLOBAL_TIMEOUT_MS = 5 * 60 * 1000;

await Actor.init();

// Message of the error that ended the run, if any. It is kept outside the
// try block so the exit status is chosen only after all cleanup has finished.
let failureMessage: string | undefined;

try {
    const input = await validateInput(await Actor.getInput());

    const parsera = new Parsera({
        apiKey: input.apiKey,
        timeout: REQUEST_TIMEOUT_MS,
        retryOptions: {
            maxRetries: 3,
            backoffFactor: 2,
            initialDelay: 1000,
        },
        defaultProxyCountry: input.proxyCountry || "UnitedStates",
    });

    // Set up event handlers with proper typing
    parsera.on<ExtractOptions>("extract:start", (event) => {
        log.info("Starting extraction process", {
            url: event.data?.url,
            attributes: event.data?.attributes,
        });
    });

    parsera.on("request:retry", (event) => {
        log.warning("Retrying request", {
            // retryCount is reported as one less than the attempt number shown here
            attemptNumber: (event.retryCount ?? 0) + 1,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    parsera.on("rateLimit", () => {
        log.warning("Rate limit hit, backing off...");
    });

    parsera.on("request:error", (event) => {
        log.error("Request failed", {
            error: event.error?.message,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    parsera.on<ParseraResponse>("extract:complete", (event) => {
        log.info("Extraction completed successfully", {
            itemCount: event.data?.data?.length ?? 0,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    parsera.on("timeout", (event) => {
        log.warning("Request timed out", {
            error: event.error?.message,
            timestamp: new Date(event.timestamp).toISOString(),
        });
    });

    // Set up global timeout: abort the extraction if it runs too long.
    const controller = new AbortController();
    const timeout = setTimeout(() => {
        controller.abort();
        log.warning("Extraction timed out after 5 minutes");
    }, GLOBAL_TIMEOUT_MS);

    try {
        const extractedData = await parsera.extract({
            ...input,
            signal: controller.signal,
            // Respect the validated input; fall back to false only when the
            // caller did not set it. (A literal `false` placed after the
            // spread would silently override the user's choice.)
            precisionMode: input.precisionMode ?? false,
        });

        await Actor.pushData(extractedData);

        log.info("Actor finished successfully", {
            extractedItemCount: extractedData.length,
            lastRunAt: new Date().toISOString(),
        });
    } finally {
        clearTimeout(timeout);
        parsera.removeAllListeners(); // Clean up event listeners
    }
} catch (error) {
    failureMessage = error instanceof Error ? error.message : "Unknown error";

    if (error instanceof Error && error.name === "AbortError") {
        log.error("Extraction was aborted", {
            error: error.message,
            timestamp: new Date().toISOString(),
        });
    } else {
        log.error("Actor failed", {
            error: failureMessage,
            timestamp: new Date().toISOString(),
        });
    }
}

// Calling Actor.exit() unconditionally would end the process with its default
// (success) status even while an error is propagating, so a failed run must be
// finished through Actor.fail() instead.
if (failureMessage === undefined) {
    await Actor.exit();
} else {
    await Actor.fail(failureMessage);
}
.dockerignore
1# configurations
2.idea
3.vscode
4
5# crawlee and apify storage folders
6apify_storage
7crawlee_storage
8storage
9
10# installed files
11node_modules
12
13# git folder
14.git
15
16# dist folder
17dist
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "root": true,
3 "env": {
4 "browser": true,
5 "es2020": true,
6 "node": true
7 },
8 "extends": [
9 "@apify/eslint-config-ts"
10 ],
11 "parserOptions": {
12 "project": "./tsconfig.json",
13 "ecmaVersion": 2020
14 },
15 "ignorePatterns": [
16 "node_modules",
17 "dist",
18 "**/*.d.ts"
19 ]
20}
.gitignore
# This file tells Git which files shouldn't be added to source control

.idea
.vscode
apify_storage
crawlee_storage
node_modules
dist
tsconfig.tsbuildinfo

# Ignore everything in the local storage folder except the default INPUT.json.
# The storage directory itself must NOT be ignored: Git cannot re-include a
# file whose parent directory is excluded, so each level is opened up in turn.
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

# Added by Apify CLI
.venv
package.json
1{
2 "name": "apify-actor-parsera",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "Extract data from any webpage with AI using Parsera",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.2.6",
11 "cheerio": "^1.0.0-rc.12",
12 "zod": "^3.23.8"
13 },
14 "devDependencies": {
15 "@apify/eslint-config-ts": "^0.3.0",
16 "@apify/tsconfig": "^0.1.0",
17 "@typescript-eslint/eslint-plugin": "^7.18.0",
18 "@typescript-eslint/parser": "^7.18.0",
19 "eslint": "^8.50.0",
20 "tsx": "^4.6.2",
21 "typescript": "^5.3.3",
22 "vitest": "^2.1.6"
23 },
24 "scripts": {
25 "start": "npm run start:dev",
26 "start:prod": "node dist/main.js",
27 "start:dev": "tsx src/main.ts",
28 "build": "tsc",
29 "test": "vitest run",
30 "test:watch": "vitest",
31 "test:coverage": "vitest run --coverage"
32 },
33 "author": "Doug Silkstone <doug@parsera.org>",
34 "license": "ISC"
35}
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "module": "NodeNext",
5 "moduleResolution": "NodeNext",
6 "target": "ES2022",
7 "outDir": "dist",
8 "noUnusedLocals": false,
9 "skipLibCheck": true,
10 "lib": ["DOM"]
11 },
12 "include": [
13 "./src/**/*"
14 ]
15}
vitest.config.ts
1import { defineConfig } from "vitest/config";
2
3export default defineConfig({
4 test: {
5 globals: true,
6 environment: "node",
7 include: ["**/*.test.ts"],
8 coverage: {
9 provider: "v8",
10 reporter: ["text", "json", "html"],
11 },
12 },
13});
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
1 star
Created in Nov 2024
Modified 18 hours ago