Baidu Scraper
Deprecated
Pricing
Pay per usage
Go to Store
Baidu Scraper
Deprecated
This actor automatically crawls the search result pages of the Baidu search engine for your keywords
0.0 (0)
Pricing
Pay per usage
4
Total users
42
Monthly users
1
Last modified
a year ago
.DS_Store
.eslintrc
{ "extends": "@apify"}
.gitignore
apify_storage
node_modules
.idea
dist
memory_storage
crawlee_storage
Dockerfile
FROM apify/actor-node-playwright-chrome:16
RUN npm -v
COPY . .
RUN npm i
RUN npx playwright install
RUN npm run build
INPUT_SCHEMA.json
{ "title": "Baidu Scraper", "description": "Scraper for Chinese Search Engine Baidu", "type": "object", "schemaVersion": 1, "properties": { "searchPhrases": { "title": "Search Phrases", "type": "array", "description": "Phrases to search on Baidu", "prefill": ["Apify"], "editor": "stringList" }, "pages": { "title": "Pages", "type": "integer", "description": "Number of pages to search", "editor": "number" } }, "required": ["searchPhrases"]}
apify.json.deprecated
{ "name": "baidu-scraper", "version": "0.0", "buildTag": "latest", "env": null}
baidu-scraper.zip
package.json
{ "name": "baidu-scraper", "version": "0.0.1", "description": "Apify scraper for Baidu search engine", "main": "dist/main.js", "scripts": { "build": "tsc -p tsconfig.json && tsc-alias", "start": "node --experimental-specifier-resolution=node dist/main.js", "buildAndRun": "rm -rf dist crawlee_storage && npm run build && npm run start", "publish": "npm run build && apify push" }, "keywords": [ "apify" ], "author": "Jan Turoň", "license": "ISC", "dependencies": { "@apify/tsconfig": "^0.1.0", "@crawlee/playwright": "^3.0.3", "apify": "^3.0.2", "axios": "^0.27.2", "cheerio": "^1.0.0-rc.12", "crawlee": "^3.0.3", "playwright": "^1.25.0", "tsc-alias": "^1.7.0", "typescript": "^4.7.4" }, "type": "module", "devDependencies": { "@types/node": "^17.0.41", "apify-cli": "^0.7.4", "husky": "^8.0.1", "prettier": "^2.7.1" }, "optionalDependencies": { "fsevents": "^2.3.2" }, "engines": { "node": ">=16" }}
tsconfig.json
{ "extends": "@apify/tsconfig", "compilerOptions": { "module": "ES2022", "sourceMap": false, "declaration": false, "declarationMap": false, "target": "ES2022", "outDir": "dist", "lib": ["DOM"], "baseUrl": "src", "paths": { "@/*": ["./*"] } }, "include": ["src"]}
.actor/actor.json
{ "actorSpecification": 1, "name": "baidu-scraper", "version": "0.0", "buildTag": "latest"}
.husky/pre-commit
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"
src/main.ts
1import { resolveBaiduLink, resolveResultsCount } from "@/utils";2import { KeyValueStore, Dataset, Actor } from "apify";3import { PlaywrightCrawler } from "crawlee";4
5interface Link {6 url: string;7 title: string;8}9
10interface ResultLink extends Link {11 description: string;12}13
14interface DataResults {15 results: ResultLink[];16 hotSearchResults?: Link[];17 relatedSearchKeywords?: string[];18 similarSearchKeywords?: string[];19 resultsCount?: number;20}21
22interface Input {23 searchPhrases: string[];24 pages?: number;25}26
27await Actor.init();28
29const crawler = new PlaywrightCrawler({30 requestHandler: async ({ page }) => {31 const resultElements = await page.$$("h3");32 const results: ResultLink[] = [];33 for (const titleElement of resultElements) {34 const title = (await titleElement.textContent()) || "";35 const linkElement = await titleElement.$("a");36 const linkUrl = await linkElement?.getAttribute("href");37
38 const url = linkUrl ? await resolveBaiduLink(linkUrl) : "";39
40 results.push({ title, description: linkUrl!, url });41 }42
43 const hotSearchResultElements = await page.$$("div[class^='toplist1-tr']");44 const hotSearchResults: Link[] = [];45 for (const hotSearchElement of hotSearchResultElements.slice(0, 16)) {46 const title = (await hotSearchElement.textContent()) || "";47 const linkElement = await hotSearchElement.$("a");48 const url = (await linkElement?.getAttribute("href")) || "";49
50 hotSearchResults.push({ title, url });51 }52
53 const relatedKeywordsElements = await page.$$(54 "div[tpl='recommend_list'] .c-gap-top-xsmall"55 );56 const relatedSearchKeywords: string[] = [];57 for (const relatedKeywordsElement of relatedKeywordsElements) {58 const title = (await relatedKeywordsElement.textContent()) || "";59 relatedSearchKeywords.push(title);60 }61
62 const similarKeywordsElements = await page.$$("a[class^='rs-link']");63 const similarSearchKeywords: string[] = [];64 for (const similarKeywordsElement of similarKeywordsElements) {65 const title = (await similarKeywordsElement.getAttribute("title")) || "";66 similarSearchKeywords.push(title);67 }68
69 const resultsCountElement = await page.$("span[class^='hint']");70 const resultsCount = resolveResultsCount(71 (await resultsCountElement?.textContent()) || ""72 );73
74 const dataStructure: DataResults = {75 results,76 hotSearchResults,77 resultsCount,78 similarSearchKeywords,79 relatedSearchKeywords,80 };81
82 await Dataset.pushData([dataStructure]);83 },84});85
86const { searchPhrases = ["apify", "nic"], pages = 1 } =87 (await KeyValueStore.getInput<Input>()) ?? {};88
89const pageArray = [...Array(pages).keys()];90
91const requests = searchPhrases.flatMap((searchPhrase) =>92 pageArray.map((pageNr) => {93 const lookoutQuery = new URLSearchParams([94 ["pn", pageNr.toString()],95 ["wd", searchPhrase],96 ]);97 return `https://www.baidu.com/s?${lookoutQuery.toString()}`;98 })99);100
101await crawler.addRequests(requests);102await crawler.run();103
104await Actor.exit();
src/utils.ts
1import axios from "axios";2import cheerio from "cheerio";3
4/**5 * There's 2 discovered baidu links behaviors6 * - redirect via server response 302,7 * - timed-out custom redirect with custom HTML (status 200)8 * @param url Baidu redirect url. Something like: http://www.baidu.com/link?url=someBasey64Chars9 */10export const resolveBaiduLink = async (url: string): Promise<string> => {11 const { data, headers } = await axios.get<string>(url, {12 maxRedirects: 0,13 validateStatus: (status) => [302, 200].includes(status), // Baidu sometimes returns redirect 302 which leads to Axios error.14 });15
16 // Redirect location is enough when available (for 302 response)17 if (headers.location) return headers.location;18
19 // Baidu sometimes returns 200 with custom html - address is available in 'noscript' tag20 const cheerioSelector = cheerio.load(data);21
22 const noScriptElementContent = cheerioSelector("noscript").html();23
24 if (!noScriptElementContent) return "";25
26 const fullUrl = /(')(?:(?=(\\?))\2.)*?\1/.exec(noScriptElementContent);27
28 return fullUrl?.[1]!;29};30
31/**32 * Resolves results count string to number33 * @param countText span tag text above first Baidu result, looks like '百度为您找到相关结果约740,000个'34 */35export const resolveResultsCount = (countText: string): number => {36 const digitMatches = [...countText.matchAll(/\d+/g)]; // Thousands are comma-separated, matches will be in groups - ['740', '000']37
38 const digitString = digitMatches.reduce(39 (digitString, digitMatch) => digitString + digitMatch,40 ""41 );42
43 return parseInt(digitString);44};