Baidu Scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Baidu Scraper
jan.turon/baidu-scraper
This Actor lets you automatically crawl the Baidu Search engine's result pages for your keywords
.DS_Store
Download
.eslintrc
1{
2 "extends": "@apify"
3}
.gitignore
1apify_storage
2node_modules
3.idea
4dist
5memory_storage
6crawlee_storage
Dockerfile
1FROM apify/actor-node-playwright-chrome:16
2
3RUN npm -v
4COPY . .
5
6RUN npm i
7RUN npx playwright install
8RUN npm run build
INPUT_SCHEMA.json
1{
2 "title": "Baidu Scraper",
3 "description": "Scraper for Chinese Search Engine Baidu",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "searchPhrases": {
8 "title": "Search Phrases",
9 "type": "array",
10 "description": "Phrases to search on Baidu",
11 "prefill": ["Apify"],
12 "editor": "stringList"
13 },
14 "pages": {
15 "title": "Pages",
16 "type": "integer",
17 "description": "Number of pages to search",
18 "editor": "number"
19 }
20 },
21 "required": ["searchPhrases"]
22}
apify.json.deprecated
1{
2 "name": "baidu-scraper",
3 "version": "0.0",
4 "buildTag": "latest",
5 "env": null
6}
baidu-scraper.zip
Download
package.json
1{
2 "name": "baidu-scraper",
3 "version": "0.0.1",
4 "description": "Apify scraper for Baidu search engine",
5 "main": "dist/main.js",
6 "scripts": {
7 "build": "tsc -p tsconfig.json && tsc-alias",
8 "start": "node --experimental-specifier-resolution=node dist/main.js",
9 "buildAndRun": "rm -rf dist crawlee_storage && npm run build && npm run start",
10 "publish": "npm run build && apify push"
11 },
12 "keywords": [
13 "apify"
14 ],
15 "author": "Jan Turoň",
16 "license": "ISC",
17 "dependencies": {
18 "@apify/tsconfig": "^0.1.0",
19 "@crawlee/playwright": "^3.0.3",
20 "apify": "^3.0.2",
21 "axios": "^0.27.2",
22 "cheerio": "^1.0.0-rc.12",
23 "crawlee": "^3.0.3",
24 "playwright": "^1.25.0",
25 "tsc-alias": "^1.7.0",
26 "typescript": "^4.7.4"
27 },
28 "type": "module",
29 "devDependencies": {
30 "@types/node": "^17.0.41",
31 "apify-cli": "^0.7.4",
32 "husky": "^8.0.1",
33 "prettier": "^2.7.1"
34 },
35 "optionalDependencies": {
36 "fsevents": "^2.3.2"
37 },
38 "engines": {
39 "node": ">=16"
40 }
41}
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "module": "ES2022",
5 "sourceMap": false,
6 "declaration": false,
7 "declarationMap": false,
8 "target": "ES2022",
9 "outDir": "dist",
10 "lib": ["DOM"],
11 "baseUrl": "src",
12 "paths": { "@/*": ["./*"] }
13 },
14 "include": ["src"]
15}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "baidu-scraper",
4 "version": "0.0",
5 "buildTag": "latest"
6}
.husky/pre-commit
1#!/usr/bin/env sh
2. "$(dirname -- "$0")/_/husky.sh"
src/main.ts
1import { resolveBaiduLink, resolveResultsCount } from "@/utils";
2import { KeyValueStore, Dataset, Actor } from "apify";
3import { PlaywrightCrawler } from "crawlee";
4
/** A hyperlink scraped from the results page: target URL plus its visible title text. */
interface Link {
  url: string;
  title: string;
}

/**
 * An organic search result.
 * NOTE(review): the request handler below fills `description` with the raw
 * Baidu redirect href, not a text snippet — confirm intent.
 */
interface ResultLink extends Link {
  description: string;
}

/** One dataset record produced per crawled results page. */
interface DataResults {
  results: ResultLink[];
  // Trending entries from Baidu's "hot search" panel.
  hotSearchResults?: Link[];
  // Keywords from the "recommend_list" related-search widget.
  relatedSearchKeywords?: string[];
  // Keywords from the bottom-of-page "rs-link" suggestions.
  similarSearchKeywords?: string[];
  // Approximate total result count parsed from the page's hint text.
  resultsCount?: number;
}

/** Actor input (see INPUT_SCHEMA.json). */
interface Input {
  searchPhrases: string[];
  // Number of result pages to crawl per phrase; the script defaults to 1.
  pages?: number;
}
26
27await Actor.init();
28
29const crawler = new PlaywrightCrawler({
30 requestHandler: async ({ page }) => {
31 const resultElements = await page.$$("h3");
32 const results: ResultLink[] = [];
33 for (const titleElement of resultElements) {
34 const title = (await titleElement.textContent()) || "";
35 const linkElement = await titleElement.$("a");
36 const linkUrl = await linkElement?.getAttribute("href");
37
38 const url = linkUrl ? await resolveBaiduLink(linkUrl) : "";
39
40 results.push({ title, description: linkUrl!, url });
41 }
42
43 const hotSearchResultElements = await page.$$("div[class^='toplist1-tr']");
44 const hotSearchResults: Link[] = [];
45 for (const hotSearchElement of hotSearchResultElements.slice(0, 16)) {
46 const title = (await hotSearchElement.textContent()) || "";
47 const linkElement = await hotSearchElement.$("a");
48 const url = (await linkElement?.getAttribute("href")) || "";
49
50 hotSearchResults.push({ title, url });
51 }
52
53 const relatedKeywordsElements = await page.$$(
54 "div[tpl='recommend_list'] .c-gap-top-xsmall"
55 );
56 const relatedSearchKeywords: string[] = [];
57 for (const relatedKeywordsElement of relatedKeywordsElements) {
58 const title = (await relatedKeywordsElement.textContent()) || "";
59 relatedSearchKeywords.push(title);
60 }
61
62 const similarKeywordsElements = await page.$$("a[class^='rs-link']");
63 const similarSearchKeywords: string[] = [];
64 for (const similarKeywordsElement of similarKeywordsElements) {
65 const title = (await similarKeywordsElement.getAttribute("title")) || "";
66 similarSearchKeywords.push(title);
67 }
68
69 const resultsCountElement = await page.$("span[class^='hint']");
70 const resultsCount = resolveResultsCount(
71 (await resultsCountElement?.textContent()) || ""
72 );
73
74 const dataStructure: DataResults = {
75 results,
76 hotSearchResults,
77 resultsCount,
78 similarSearchKeywords,
79 relatedSearchKeywords,
80 };
81
82 await Dataset.pushData([dataStructure]);
83 },
84});
85
86const { searchPhrases = ["apify", "nic"], pages = 1 } =
87 (await KeyValueStore.getInput<Input>()) ?? {};
88
89const pageArray = [...Array(pages).keys()];
90
91const requests = searchPhrases.flatMap((searchPhrase) =>
92 pageArray.map((pageNr) => {
93 const lookoutQuery = new URLSearchParams([
94 ["pn", pageNr.toString()],
95 ["wd", searchPhrase],
96 ]);
97 return `https://www.baidu.com/s?${lookoutQuery.toString()}`;
98 })
99);
100
101await crawler.addRequests(requests);
102await crawler.run();
103
104await Actor.exit();
src/utils.ts
1import axios from "axios";
2import cheerio from "cheerio";
3
4/**
5 * There's 2 discovered baidu links behaviors
6 * - redirect via server response 302,
7 * - timed-out custom redirect with custom HTML (status 200)
8 * @param url Baidu redirect url. Something like: http://www.baidu.com/link?url=someBasey64Chars
9 */
10export const resolveBaiduLink = async (url: string): Promise<string> => {
11 const { data, headers } = await axios.get<string>(url, {
12 maxRedirects: 0,
13 validateStatus: (status) => [302, 200].includes(status), // Baidu sometimes returns redirect 302 which leads to Axios error.
14 });
15
16 // Redirect location is enough when available (for 302 response)
17 if (headers.location) return headers.location;
18
19 // Baidu sometimes returns 200 with custom html - address is available in 'noscript' tag
20 const cheerioSelector = cheerio.load(data);
21
22 const noScriptElementContent = cheerioSelector("noscript").html();
23
24 if (!noScriptElementContent) return "";
25
26 const fullUrl = /(')(?:(?=(\\?))\2.)*?\1/.exec(noScriptElementContent);
27
28 return fullUrl?.[1]!;
29};
30
31/**
32 * Resolves results count string to number
33 * @param countText span tag text above first Baidu result, looks like '百度为您找到相关结果约740,000个'
34 */
35export const resolveResultsCount = (countText: string): number => {
36 const digitMatches = [...countText.matchAll(/\d+/g)]; // Thousands are comma-separated, matches will be in groups - ['740', '000']
37
38 const digitString = digitMatches.reduce(
39 (digitString, digitMatch) => digitString + digitMatch,
40 ""
41 );
42
43 return parseInt(digitString);
44};
Developer
Maintained by Community
Categories