Baidu Scraper avatar
Baidu Scraper

Deprecated

Pricing

Pay per usage

Go to Store
Baidu Scraper

Baidu Scraper

Deprecated

Developed by

Jan Turoň

Jan Turoň

Maintained by Community

This actor allows you to automatically crawl the result pages for your keyword on the Baidu search engine

0.0 (0)

Pricing

Pay per usage

4

Total users

42

Monthly users

1

Last modified

a year ago

.DS_Store

Download

.eslintrc

{
"extends": "@apify"
}

.gitignore

apify_storage
node_modules
.idea
dist
memory_storage
crawlee_storage

Dockerfile

# Apify base image: Node 16 with Playwright + Chromium preinstalled.
FROM apify/actor-node-playwright-chrome:16
# Print npm version into the build log (debugging aid only).
RUN npm -v
# NOTE(review): copying the whole context before `npm i` invalidates the npm
# layer cache on every source change; consider COPY package*.json first —
# TODO confirm against the current Apify Playwright template.
COPY . .
RUN npm i
# Install Playwright browser binaries (base image may already bundle Chromium).
RUN npx playwright install
# Compile TypeScript to dist/ (see "build" script in package.json).
RUN npm run build

INPUT_SCHEMA.json

{
"title": "Baidu Scraper",
"description": "Scraper for Chinese Search Engine Baidu",
"type": "object",
"schemaVersion": 1,
"properties": {
"searchPhrases": {
"title": "Search Phrases",
"type": "array",
"description": "Phrases to search on Baidu",
"prefill": ["Apify"],
"editor": "stringList"
},
"pages": {
"title": "Pages",
"type": "integer",
"description": "Number of pages to search",
"editor": "number"
}
},
"required": ["searchPhrases"]
}

apify.json.deprecated

{
"name": "baidu-scraper",
"version": "0.0",
"buildTag": "latest",
"env": null
}

baidu-scraper.zip

Download

package.json

{
"name": "baidu-scraper",
"version": "0.0.1",
"description": "Apify scraper for Baidu search engine",
"main": "dist/main.js",
"scripts": {
"build": "tsc -p tsconfig.json && tsc-alias",
"start": "node --experimental-specifier-resolution=node dist/main.js",
"buildAndRun": "rm -rf dist crawlee_storage && npm run build && npm run start",
"publish": "npm run build && apify push"
},
"keywords": [
"apify"
],
"author": "Jan Turoň",
"license": "ISC",
"dependencies": {
"@apify/tsconfig": "^0.1.0",
"@crawlee/playwright": "^3.0.3",
"apify": "^3.0.2",
"axios": "^0.27.2",
"cheerio": "^1.0.0-rc.12",
"crawlee": "^3.0.3",
"playwright": "^1.25.0",
"tsc-alias": "^1.7.0",
"typescript": "^4.7.4"
},
"type": "module",
"devDependencies": {
"@types/node": "^17.0.41",
"apify-cli": "^0.7.4",
"husky": "^8.0.1",
"prettier": "^2.7.1"
},
"optionalDependencies": {
"fsevents": "^2.3.2"
},
"engines": {
"node": ">=16"
}
}

tsconfig.json

{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "ES2022",
"sourceMap": false,
"declaration": false,
"declarationMap": false,
"target": "ES2022",
"outDir": "dist",
"lib": ["DOM"],
"baseUrl": "src",
"paths": { "@/*": ["./*"] }
},
"include": ["src"]
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "baidu-scraper",
"version": "0.0",
"buildTag": "latest"
}

.husky/pre-commit

#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

src/main.ts

1import { resolveBaiduLink, resolveResultsCount } from "@/utils";
2import { KeyValueStore, Dataset, Actor } from "apify";
3import { PlaywrightCrawler } from "crawlee";
4
5interface Link {
6 url: string;
7 title: string;
8}
9
10interface ResultLink extends Link {
11 description: string;
12}
13
14interface DataResults {
15 results: ResultLink[];
16 hotSearchResults?: Link[];
17 relatedSearchKeywords?: string[];
18 similarSearchKeywords?: string[];
19 resultsCount?: number;
20}
21
22interface Input {
23 searchPhrases: string[];
24 pages?: number;
25}
26
27await Actor.init();
28
29const crawler = new PlaywrightCrawler({
30 requestHandler: async ({ page }) => {
31 const resultElements = await page.$$("h3");
32 const results: ResultLink[] = [];
33 for (const titleElement of resultElements) {
34 const title = (await titleElement.textContent()) || "";
35 const linkElement = await titleElement.$("a");
36 const linkUrl = await linkElement?.getAttribute("href");
37
38 const url = linkUrl ? await resolveBaiduLink(linkUrl) : "";
39
40 results.push({ title, description: linkUrl!, url });
41 }
42
43 const hotSearchResultElements = await page.$$("div[class^='toplist1-tr']");
44 const hotSearchResults: Link[] = [];
45 for (const hotSearchElement of hotSearchResultElements.slice(0, 16)) {
46 const title = (await hotSearchElement.textContent()) || "";
47 const linkElement = await hotSearchElement.$("a");
48 const url = (await linkElement?.getAttribute("href")) || "";
49
50 hotSearchResults.push({ title, url });
51 }
52
53 const relatedKeywordsElements = await page.$$(
54 "div[tpl='recommend_list'] .c-gap-top-xsmall"
55 );
56 const relatedSearchKeywords: string[] = [];
57 for (const relatedKeywordsElement of relatedKeywordsElements) {
58 const title = (await relatedKeywordsElement.textContent()) || "";
59 relatedSearchKeywords.push(title);
60 }
61
62 const similarKeywordsElements = await page.$$("a[class^='rs-link']");
63 const similarSearchKeywords: string[] = [];
64 for (const similarKeywordsElement of similarKeywordsElements) {
65 const title = (await similarKeywordsElement.getAttribute("title")) || "";
66 similarSearchKeywords.push(title);
67 }
68
69 const resultsCountElement = await page.$("span[class^='hint']");
70 const resultsCount = resolveResultsCount(
71 (await resultsCountElement?.textContent()) || ""
72 );
73
74 const dataStructure: DataResults = {
75 results,
76 hotSearchResults,
77 resultsCount,
78 similarSearchKeywords,
79 relatedSearchKeywords,
80 };
81
82 await Dataset.pushData([dataStructure]);
83 },
84});
85
86const { searchPhrases = ["apify", "nic"], pages = 1 } =
87 (await KeyValueStore.getInput<Input>()) ?? {};
88
89const pageArray = [...Array(pages).keys()];
90
91const requests = searchPhrases.flatMap((searchPhrase) =>
92 pageArray.map((pageNr) => {
93 const lookoutQuery = new URLSearchParams([
94 ["pn", pageNr.toString()],
95 ["wd", searchPhrase],
96 ]);
97 return `https://www.baidu.com/s?${lookoutQuery.toString()}`;
98 })
99);
100
101await crawler.addRequests(requests);
102await crawler.run();
103
104await Actor.exit();

src/utils.ts

1import axios from "axios";
2import cheerio from "cheerio";
3
4/**
5 * There's 2 discovered baidu links behaviors
6 * - redirect via server response 302,
7 * - timed-out custom redirect with custom HTML (status 200)
8 * @param url Baidu redirect url. Something like: http://www.baidu.com/link?url=someBasey64Chars
9 */
10export const resolveBaiduLink = async (url: string): Promise<string> => {
11 const { data, headers } = await axios.get<string>(url, {
12 maxRedirects: 0,
13 validateStatus: (status) => [302, 200].includes(status), // Baidu sometimes returns redirect 302 which leads to Axios error.
14 });
15
16 // Redirect location is enough when available (for 302 response)
17 if (headers.location) return headers.location;
18
19 // Baidu sometimes returns 200 with custom html - address is available in 'noscript' tag
20 const cheerioSelector = cheerio.load(data);
21
22 const noScriptElementContent = cheerioSelector("noscript").html();
23
24 if (!noScriptElementContent) return "";
25
26 const fullUrl = /(')(?:(?=(\\?))\2.)*?\1/.exec(noScriptElementContent);
27
28 return fullUrl?.[1]!;
29};
30
31/**
32 * Resolves results count string to number
33 * @param countText span tag text above first Baidu result, looks like '百度为您找到相关结果约740,000个'
34 */
35export const resolveResultsCount = (countText: string): number => {
36 const digitMatches = [...countText.matchAll(/\d+/g)]; // Thousands are comma-separated, matches will be in groups - ['740', '000']
37
38 const digitString = digitMatches.reduce(
39 (digitString, digitMatch) => digitString + digitMatch,
40 ""
41 );
42
43 return parseInt(digitString);
44};