Baidu Scraper avatar

Baidu Scraper

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Baidu Scraper

Baidu Scraper

jan.turon/baidu-scraper

This Actor allows you to automatically crawl the result pages for your keywords on the Baidu search engine

.DS_Store

Download

.eslintrc

1{
2  "extends": "@apify"
3}

.gitignore

1apify_storage
2node_modules
3.idea
4dist
5memory_storage
6crawlee_storage

Dockerfile

1FROM apify/actor-node-playwright-chrome:16
2
3RUN npm -v
4COPY . .
5
6RUN npm i
7RUN npx playwright install
8RUN npm run build

INPUT_SCHEMA.json

1{
2  "title": "Baidu Scraper",
3  "description": "Scraper for Chinese Search Engine Baidu",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "searchPhrases": {
8      "title": "Search Phrases",
9      "type": "array",
10      "description": "Phrases to search on Baidu",
11      "prefill": ["Apify"],
12      "editor": "stringList"
13    },
14    "pages": {
15      "title": "Pages",
16      "type": "integer",
17      "description": "Number of pages to search",
18      "editor": "number"
19    }
20  },
21  "required": ["searchPhrases"]
22}

apify.json.deprecated

1{
2	"name": "baidu-scraper",
3	"version": "0.0",
4	"buildTag": "latest",
5	"env": null
6}

baidu-scraper.zip

Download

package.json

1{
2  "name": "baidu-scraper",
3  "version": "0.0.1",
4  "description": "Apify scraper for Baidu search engine",
5  "main": "dist/main.js",
6  "scripts": {
7    "build": "tsc -p tsconfig.json && tsc-alias",
8    "start": "node --experimental-specifier-resolution=node dist/main.js",
9    "buildAndRun": "rm -rf dist crawlee_storage && npm run build && npm run start",
10    "publish": "npm run build && apify push"
11  },
12  "keywords": [
13    "apify"
14  ],
15  "author": "Jan Turoň",
16  "license": "ISC",
17  "dependencies": {
18    "@apify/tsconfig": "^0.1.0",
19    "@crawlee/playwright": "^3.0.3",
20    "apify": "^3.0.2",
21    "axios": "^0.27.2",
22    "cheerio": "^1.0.0-rc.12",
23    "crawlee": "^3.0.3",
24    "playwright": "^1.25.0",
25    "tsc-alias": "^1.7.0",
26    "typescript": "^4.7.4"
27  },
28  "type": "module",
29  "devDependencies": {
30    "@types/node": "^17.0.41",
31    "apify-cli": "^0.7.4",
32    "husky": "^8.0.1",
33    "prettier": "^2.7.1"
34  },
35  "optionalDependencies": {
36    "fsevents": "^2.3.2"
37  },
38  "engines": {
39    "node": ">=16"
40  }
41}

tsconfig.json

1{
2  "extends": "@apify/tsconfig",
3  "compilerOptions": {
4    "module": "ES2022",
5    "sourceMap": false,
6    "declaration": false,
7    "declarationMap": false,
8    "target": "ES2022",
9    "outDir": "dist",
10    "lib": ["DOM"],
11    "baseUrl": "src",
12    "paths": { "@/*": ["./*"] }
13  },
14  "include": ["src"]
15}

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "baidu-scraper",
4	"version": "0.0",
5	"buildTag": "latest"
6}

.husky/pre-commit

1#!/usr/bin/env sh
2. "$(dirname -- "$0")/_/husky.sh"

src/main.ts

1import { resolveBaiduLink, resolveResultsCount } from "@/utils";
2import { KeyValueStore, Dataset, Actor } from "apify";
3import { PlaywrightCrawler } from "crawlee";
4
5interface Link {
6  url: string;
7  title: string;
8}
9
10interface ResultLink extends Link {
11  description: string;
12}
13
14interface DataResults {
15  results: ResultLink[];
16  hotSearchResults?: Link[];
17  relatedSearchKeywords?: string[];
18  similarSearchKeywords?: string[];
19  resultsCount?: number;
20}
21
22interface Input {
23  searchPhrases: string[];
24  pages?: number;
25}
26
27await Actor.init();
28
29const crawler = new PlaywrightCrawler({
30  requestHandler: async ({ page }) => {
31    const resultElements = await page.$$("h3");
32    const results: ResultLink[] = [];
33    for (const titleElement of resultElements) {
34      const title = (await titleElement.textContent()) || "";
35      const linkElement = await titleElement.$("a");
36      const linkUrl = await linkElement?.getAttribute("href");
37
38      const url = linkUrl ? await resolveBaiduLink(linkUrl) : "";
39
40      results.push({ title, description: linkUrl!, url });
41    }
42
43    const hotSearchResultElements = await page.$$("div[class^='toplist1-tr']");
44    const hotSearchResults: Link[] = [];
45    for (const hotSearchElement of hotSearchResultElements.slice(0, 16)) {
46      const title = (await hotSearchElement.textContent()) || "";
47      const linkElement = await hotSearchElement.$("a");
48      const url = (await linkElement?.getAttribute("href")) || "";
49
50      hotSearchResults.push({ title, url });
51    }
52
53    const relatedKeywordsElements = await page.$$(
54      "div[tpl='recommend_list'] .c-gap-top-xsmall"
55    );
56    const relatedSearchKeywords: string[] = [];
57    for (const relatedKeywordsElement of relatedKeywordsElements) {
58      const title = (await relatedKeywordsElement.textContent()) || "";
59      relatedSearchKeywords.push(title);
60    }
61
62    const similarKeywordsElements = await page.$$("a[class^='rs-link']");
63    const similarSearchKeywords: string[] = [];
64    for (const similarKeywordsElement of similarKeywordsElements) {
65      const title = (await similarKeywordsElement.getAttribute("title")) || "";
66      similarSearchKeywords.push(title);
67    }
68
69    const resultsCountElement = await page.$("span[class^='hint']");
70    const resultsCount = resolveResultsCount(
71      (await resultsCountElement?.textContent()) || ""
72    );
73
74    const dataStructure: DataResults = {
75      results,
76      hotSearchResults,
77      resultsCount,
78      similarSearchKeywords,
79      relatedSearchKeywords,
80    };
81
82    await Dataset.pushData([dataStructure]);
83  },
84});
85
86const { searchPhrases = ["apify", "nic"], pages = 1 } =
87  (await KeyValueStore.getInput<Input>()) ?? {};
88
89const pageArray = [...Array(pages).keys()];
90
91const requests = searchPhrases.flatMap((searchPhrase) =>
92  pageArray.map((pageNr) => {
93    const lookoutQuery = new URLSearchParams([
94      ["pn", pageNr.toString()],
95      ["wd", searchPhrase],
96    ]);
97    return `https://www.baidu.com/s?${lookoutQuery.toString()}`;
98  })
99);
100
101await crawler.addRequests(requests);
102await crawler.run();
103
104await Actor.exit();

src/utils.ts

1import axios from "axios";
2import cheerio from "cheerio";
3
4/**
5 * There's 2 discovered baidu links behaviors
6 * - redirect via server response 302,
7 * - timed-out custom redirect with custom HTML (status 200)
8 * @param url Baidu redirect url. Something like: http://www.baidu.com/link?url=someBasey64Chars
9 */
10export const resolveBaiduLink = async (url: string): Promise<string> => {
11  const { data, headers } = await axios.get<string>(url, {
12    maxRedirects: 0,
13    validateStatus: (status) => [302, 200].includes(status), // Baidu sometimes returns redirect 302 which leads to Axios error.
14  });
15
16  // Redirect location is enough when available (for 302 response)
17  if (headers.location) return headers.location;
18
19  // Baidu sometimes returns 200 with custom html - address is available in 'noscript' tag
20  const cheerioSelector = cheerio.load(data);
21
22  const noScriptElementContent = cheerioSelector("noscript").html();
23
24  if (!noScriptElementContent) return "";
25
26  const fullUrl = /(')(?:(?=(\\?))\2.)*?\1/.exec(noScriptElementContent);
27
28  return fullUrl?.[1]!;
29};
30
31/**
32 * Resolves results count string to number
33 * @param countText span tag text above first Baidu result, looks like '百度为您找到相关结果约740,000个'
34 */
35export const resolveResultsCount = (countText: string): number => {
36  const digitMatches = [...countText.matchAll(/\d+/g)]; // Thousands are comma-separated, matches will be in groups - ['740', '000']
37
38  const digitString = digitMatches.reduce(
39    (digitString, digitMatch) => digitString + digitMatch,
40    ""
41  );
42
43  return parseInt(digitString);
44};
Developer
Maintained by Community
Categories