BeautifulSoup Scraper

apify/beautifulsoup-scraper
Crawls websites using raw HTTP requests. It parses the HTML with the BeautifulSoup library and extracts data from the pages using Python code. Supports both recursive crawling and lists of URLs. This Actor is a Python alternative to Cheerio Scraper.
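
For illustration, a page function can be as small as the sketch below. This is not the Actor's bundled example; it only relies on the context.soup object (the parsed BeautifulSoup document) described in the input schema further down, and the extracted fields are arbitrary.

# A minimal page function sketch. The identifier must remain page_function;
# context.soup is the parsed BeautifulSoup document for the current page.
def page_function(context):
    soup = context.soup
    return {
        "title": soup.title.string if soup.title else None,
        "first_heading": soup.h1.get_text(strip=True) if soup.h1 else None,
    }

The data returned from the page function is stored in the run's dataset.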

You can access BeautifulSoup Scraper programmatically from your own applications using the Apify API. To call the API, you need an Apify account and your API token, which you can find under Integrations settings in Apify Console.
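
For example, the sketch below calls the run-sync-get-dataset-items endpoint from the OpenAPI definition that follows, using the Python requests library; the token placeholder, start URL, and page function body are illustrative, and the input fields follow the inputSchema defined below.

import requests

API_TOKEN = "<YOUR_API_TOKEN>"  # from Integrations settings in Apify Console

# Input follows the inputSchema from the OpenAPI definition below;
# the start URL and page function body are illustrative.
run_input = {
    "startUrls": [{"url": "https://example.com"}],
    "pageFunction": (
        "def page_function(context):\n"
        "    title = context.soup.title\n"
        "    return {'title': title.string if title else None}\n"
    ),
    "proxyConfiguration": {"useApifyProxy": True},
}

response = requests.post(
    "https://api.apify.com/v2/acts/apify~beautifulsoup-scraper/run-sync-get-dataset-items",
    params={"token": API_TOKEN},
    json=run_input,
    timeout=300,
)
response.raise_for_status()
print(response.json())  # the dataset items produced by the run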

{
  "openapi": "3.0.1",
  "info": {
    "version": "0.1",
    "x-build-id": "3vsrJE15Of3EeHHKg"
  },
  "servers": [
    {
      "url": "https://api.apify.com/v2"
    }
  ],
  "paths": {
    "/acts/apify~beautifulsoup-scraper/run-sync-get-dataset-items": {
      "post": {
        "operationId": "run-sync-get-dataset-items-apify-beautifulsoup-scraper",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for its completion, and returns Actor's dataset items in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    },
    "/acts/apify~beautifulsoup-scraper/runs": {
      "post": {
        "operationId": "runs-sync-apify-beautifulsoup-scraper",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor and returns information about the initiated run in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/runsResponseSchema"
                }
              }
            }
          }
        }
      }
    },
    "/acts/apify~beautifulsoup-scraper/run-sync": {
      "post": {
        "operationId": "run-sync-apify-beautifulsoup-scraper",
        "x-openai-isConsequential": false,
        "summary": "Executes an Actor, waits for completion, and returns the OUTPUT from Key-value store in response.",
        "tags": [
          "Run Actor"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/inputSchema"
              }
            }
          }
        },
        "parameters": [
          {
            "name": "token",
            "in": "query",
            "required": true,
            "schema": {
              "type": "string"
            },
            "description": "Enter your Apify token here"
          }
        ],
        "responses": {
          "200": {
            "description": "OK"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "inputSchema": {
        "type": "object",
        "required": [
          "startUrls",
          "pageFunction",
          "proxyConfiguration"
        ],
        "properties": {
          "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "A static list of URLs to scrape.",
            "items": {
              "type": "object",
              "required": [
                "url"
              ],
              "properties": {
                "url": {
                  "type": "string",
                  "title": "URL of a web page",
                  "format": "uri"
                }
              }
            }
          },
          "maxCrawlingDepth": {
            "title": "Max crawling depth",
            "minimum": 0,
            "type": "integer",
            "description": "Specifies how many links away from the <b>Start URLs</b> the scraper will descend. Note that pages added using <code>context.request_queue</code> in <b>Page function</b> are not subject to the maximum depth constraint.",
            "default": 1
          },
          "requestTimeout": {
            "title": "Request timeout",
            "minimum": 0,
            "type": "integer",
            "description": "The maximum duration (in seconds) for the request to complete before timing out. The timeout value is passed to the <code>httpx.AsyncClient</code> object.",
            "default": 10
          },
          "linkSelector": {
            "title": "Link selector",
            "type": "string",
            "description": "A CSS selector stating which links on the page (<code>&lt;a&gt;</code> elements with <code>href</code> attribute) shall be followed and added to the request queue. To filter the links added to the queue, use the <b>Link patterns</b> field.<br><br>If the <b>Link selector</b> is empty, the page links are ignored. Of course, you can work with the page links and the request queue in the <b>Page function</b> as well."
          },
          "linkPatterns": {
            "title": "Link patterns",
            "type": "array",
            "description": "Link patterns (regular expressions) to match links in the page that you want to enqueue. Combine with <b>Link selector</b> to tell the scraper where to find links. Omitting the link patterns will cause the scraper to enqueue all links matched by the Link selector.",
            "items": {
              "type": "string"
            }
          },
          "pageFunction": {
            "title": "Page function",
            "type": "string",
            "description": "A Python function, that is executed for every page. Use it to scrape data from the page, perform actions or add new URLs to the request queue. The page function has its own naming scope and you can import any installed modules. Typically you would want to obtain the data from the <code>context.soup</code> object and return them. Identifier <code>page_function</code> can't be changed. For more information about the <code>context</code> object you get into the <code>page_function</code> check the <a href='https://github.com/apify/actor-beautifulsoup-scraper#context' target='_blank' rel='noopener'>github.com/apify/actor-beautifulsoup-scraper#context</a>. Asynchronous functions are supported."
          },
          "soupFeatures": {
            "title": "BeautifulSoup features",
            "type": "string",
            "description": "The value of BeautifulSoup <code>features</code> argument. From BeautifulSoup docs: Desirable features of the parser to be used. This may be the name of a specific parser (\"lxml\", \"lxml-xml\", \"html.parser\", or \"html5lib\") or it may be the type of markup to be used (\"html\", \"html5\", \"xml\"). It's recommended that you name a specific parser, so that Beautiful Soup gives you the same results across platforms and virtual environments."
          },
          "soupFromEncoding": {
            "title": "BeautifulSoup from_encoding",
            "type": "string",
            "description": "The value of BeautifulSoup <code>from_encoding</code> argument. From BeautifulSoup docs: A string indicating the encoding of the document to be parsed. Pass this in if Beautiful Soup is guessing wrongly about the document's encoding."
          },
          "soupExcludeEncodings": {
            "title": "BeautifulSoup exclude_encodings",
            "type": "array",
            "description": "The value of BeautifulSoup <code>exclude_encodings</code> argument. From BeautifulSoup docs: A list of strings indicating encodings known to be wrong. Pass this in if you don't know the document's encoding but you know Beautiful Soup's guess is wrong.",
            "items": {
              "type": "string"
            }
          },
          "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
            "default": {
              "useApifyProxy": true
            }
          }
        }
      },
      "runsResponseSchema": {
        "type": "object",
        "properties": {
          "data": {
            "type": "object",
            "properties": {
              "id": {
                "type": "string"
              },
              "actId": {
                "type": "string"
              },
              "userId": {
                "type": "string"
              },
              "startedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "finishedAt": {
                "type": "string",
                "format": "date-time",
                "example": "2025-01-08T00:00:00.000Z"
              },
              "status": {
                "type": "string",
                "example": "READY"
              },
              "meta": {
                "type": "object",
                "properties": {
                  "origin": {
                    "type": "string",
                    "example": "API"
                  },
                  "userAgent": {
                    "type": "string"
                  }
                }
              },
              "stats": {
                "type": "object",
                "properties": {
                  "inputBodyLen": {
                    "type": "integer",
                    "example": 2000
                  },
                  "rebootCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "restartCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "resurrectCount": {
                    "type": "integer",
                    "example": 0
                  },
                  "computeUnits": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "options": {
                "type": "object",
                "properties": {
                  "build": {
                    "type": "string",
                    "example": "latest"
                  },
                  "timeoutSecs": {
                    "type": "integer",
                    "example": 300
                  },
                  "memoryMbytes": {
                    "type": "integer",
                    "example": 1024
                  },
                  "diskMbytes": {
                    "type": "integer",
                    "example": 2048
                  }
                }
              },
              "buildId": {
                "type": "string"
              },
              "defaultKeyValueStoreId": {
                "type": "string"
              },
              "defaultDatasetId": {
                "type": "string"
              },
              "defaultRequestQueueId": {
                "type": "string"
              },
              "buildNumber": {
                "type": "string",
                "example": "1.0.0"
              },
              "containerUrl": {
                "type": "string"
              },
              "usage": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "integer",
                    "example": 1
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              },
              "usageTotalUsd": {
                "type": "number",
                "example": 0.00005
              },
              "usageUsd": {
                "type": "object",
                "properties": {
                  "ACTOR_COMPUTE_UNITS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATASET_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "KEY_VALUE_STORE_WRITES": {
                    "type": "number",
                    "example": 0.00005
                  },
                  "KEY_VALUE_STORE_LISTS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_READS": {
                    "type": "integer",
                    "example": 0
                  },
                  "REQUEST_QUEUE_WRITES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_INTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "DATA_TRANSFER_EXTERNAL_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_RESIDENTIAL_TRANSFER_GBYTES": {
                    "type": "integer",
                    "example": 0
                  },
                  "PROXY_SERPS": {
                    "type": "integer",
                    "example": 0
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

BeautifulSoup Scraper OpenAPI definition
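
For recursive crawls, the linkSelector, linkPatterns, and maxCrawlingDepth fields from the inputSchema above work together. The sketch below is an illustrative input (the site, selector, and patterns are assumptions, not values from this page), submitted here with the apify-client Python package instead of a raw HTTP call.

from apify_client import ApifyClient

client = ApifyClient("<YOUR_API_TOKEN>")

# Illustrative recursive-crawl input; field names follow the inputSchema above.
run_input = {
    "startUrls": [{"url": "https://example.com/blog"}],
    "linkSelector": "a[href]",                          # follow <a> elements with an href
    "linkPatterns": ["https://example\\.com/blog/.*"],  # only enqueue links matching this regex
    "maxCrawlingDepth": 2,                              # at most two hops from the start URLs
    "requestTimeout": 30,                               # seconds, passed to httpx.AsyncClient
    "pageFunction": (
        "def page_function(context):\n"
        "    heading = context.soup.h1\n"
        "    return {'heading': heading.get_text(strip=True) if heading else None}\n"
    ),
    "proxyConfiguration": {"useApifyProxy": True},
}

# Start the Actor, wait for it to finish, then print the scraped items.
run = client.actor("apify/beautifulsoup-scraper").call(run_input=run_input)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)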

OpenAPI is a standard for designing and describing RESTful APIs, allowing developers to define API structure, endpoints, and data formats in a machine-readable way. It simplifies API development, integration, and documentation.

OpenAPI works well with AI agents and GPTs because it standardizes how these systems interact with APIs, enabling reliable integrations and efficient communication.

By defining machine-readable API specifications, OpenAPI allows AI models like GPTs to understand and use varied data sources, improving accuracy. This accelerates development, reduces errors, and provides context-aware responses, making OpenAPI a core component for AI applications.

You can download the OpenAPI definition for BeautifulSoup Scraper from the options below.

If you’d like to learn more about how OpenAPI powers GPTs, read our blog post.

You can also check out our other API clients.

Developer: Maintained by Apify

Actor Metrics

  • 27 monthly users

  • 5 bookmarks

  • 98% runs succeeded

  • Created in Jul 2023

  • Modified 3 months ago
