Baidu Scraper

No credit card required

This Actor is under maintenance.

This Actor is under maintenance and may be unreliable.

Baidu Scraper

Baidu Scraper

jan.turon/baidu-scraper

No credit card required

This Actor automatically crawls the Baidu search engine result pages for the keywords you provide.

.DS_Store

Download

.eslintrc

{
  "extends": "@apify"
}

.gitignore

apify_storage
node_modules
.idea
dist
memory_storage
crawlee_storage

Dockerfile

FROM apify/actor-node-playwright-chrome:16

# Log the npm version for build diagnostics.
RUN npm -v

# Copy the dependency manifests first so the npm-install layer stays
# cached across source-only changes. (Previously `COPY . .` came first,
# which invalidated the install layer on every edit.)
COPY package*.json ./

RUN npm i
RUN npx playwright install

# Copy the rest of the sources and compile TypeScript to dist/.
COPY . .
RUN npm run build

INPUT_SCHEMA.json

{
  "title": "Baidu Scraper",
  "description": "Scraper for Chinese Search Engine Baidu",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "searchPhrases": {
      "title": "Search Phrases",
      "type": "array",
      "description": "Phrases to search on Baidu",
      "prefill": ["Apify"],
      "editor": "stringList"
    },
    "pages": {
      "title": "Pages",
      "type": "integer",
      "description": "Number of pages to search",
      "editor": "number"
    }
  },
  "required": ["searchPhrases"]
}

apify.json.deprecated

{
  "name": "baidu-scraper",
  "version": "0.0",
  "buildTag": "latest",
  "env": null
}

baidu-scraper.zip

Download

package.json

{
  "name": "baidu-scraper",
  "version": "0.0.1",
  "description": "Apify scraper for Baidu search engine",
  "main": "dist/main.js",
  "scripts": {
    "build": "tsc -p tsconfig.json && tsc-alias",
    "start": "node --experimental-specifier-resolution=node dist/main.js",
    "buildAndRun": "rm -rf dist crawlee_storage && npm run build && npm run start",
    "publish": "npm run build && apify push"
  },
  "keywords": [
    "apify"
  ],
  "author": "Jan Turoň",
  "license": "ISC",
  "dependencies": {
    "@apify/tsconfig": "^0.1.0",
    "@crawlee/playwright": "^3.0.3",
    "apify": "^3.0.2",
    "axios": "^0.27.2",
    "cheerio": "^1.0.0-rc.12",
    "crawlee": "^3.0.3",
    "playwright": "^1.25.0",
    "tsc-alias": "^1.7.0",
    "typescript": "^4.7.4"
  },
  "type": "module",
  "devDependencies": {
    "@types/node": "^17.0.41",
    "apify-cli": "^0.7.4",
    "husky": "^8.0.1",
    "prettier": "^2.7.1"
  },
  "optionalDependencies": {
    "fsevents": "^2.3.2"
  },
  "engines": {
    "node": ">=16"
  }
}

tsconfig.json

{
  "extends": "@apify/tsconfig",
  "compilerOptions": {
    "module": "ES2022",
    "sourceMap": false,
    "declaration": false,
    "declarationMap": false,
    "target": "ES2022",
    "outDir": "dist",
    "lib": ["DOM"],
    "baseUrl": "src",
    "paths": { "@/*": ["./*"] }
  },
  "include": ["src"]
}

.actor/actor.json

{
  "actorSpecification": 1,
  "name": "baidu-scraper",
  "version": "0.0",
  "buildTag": "latest"
}

.husky/pre-commit

1#!/usr/bin/env sh 2. "$(dirname -- "$0")/_/husky.sh" 3

src/main.ts

1import { resolveBaiduLink, resolveResultsCount } from "@/utils"; 2import { KeyValueStore, Dataset, Actor } from "apify"; 3import { PlaywrightCrawler } from "crawlee"; 4 5interface Link { 6 url: string; 7 title: string; 8} 9 10interface ResultLink extends Link { 11 description: string; 12} 13 14interface DataResults { 15 results: ResultLink[]; 16 hotSearchResults?: Link[]; 17 relatedSearchKeywords?: string[]; 18 similarSearchKeywords?: string[]; 19 resultsCount?: number; 20} 21 22interface Input { 23 searchPhrases: string[]; 24 pages?: number; 25} 26 27await Actor.init(); 28 29const crawler = new PlaywrightCrawler({ 30 requestHandler: async ({ page }) => { 31 const resultElements = await page.$$("h3"); 32 const results: ResultLink[] = []; 33 for (const titleElement of resultElements) { 34 const title = (await titleElement.textContent()) || ""; 35 const linkElement = await titleElement.$("a"); 36 const linkUrl = await linkElement?.getAttribute("href"); 37 38 const url = linkUrl ? await resolveBaiduLink(linkUrl) : ""; 39 40 results.push({ title, description: linkUrl!, url }); 41 } 42 43 const hotSearchResultElements = await page.$$("div[class^='toplist1-tr']"); 44 const hotSearchResults: Link[] = []; 45 for (const hotSearchElement of hotSearchResultElements.slice(0, 16)) { 46 const title = (await hotSearchElement.textContent()) || ""; 47 const linkElement = await hotSearchElement.$("a"); 48 const url = (await linkElement?.getAttribute("href")) || ""; 49 50 hotSearchResults.push({ title, url }); 51 } 52 53 const relatedKeywordsElements = await page.$$( 54 "div[tpl='recommend_list'] .c-gap-top-xsmall" 55 ); 56 const relatedSearchKeywords: string[] = []; 57 for (const relatedKeywordsElement of relatedKeywordsElements) { 58 const title = (await relatedKeywordsElement.textContent()) || ""; 59 relatedSearchKeywords.push(title); 60 } 61 62 const similarKeywordsElements = await page.$$("a[class^='rs-link']"); 63 const similarSearchKeywords: string[] = []; 64 for (const 
similarKeywordsElement of similarKeywordsElements) { 65 const title = (await similarKeywordsElement.getAttribute("title")) || ""; 66 similarSearchKeywords.push(title); 67 } 68 69 const resultsCountElement = await page.$("span[class^='hint']"); 70 const resultsCount = resolveResultsCount( 71 (await resultsCountElement?.textContent()) || "" 72 ); 73 74 const dataStructure: DataResults = { 75 results, 76 hotSearchResults, 77 resultsCount, 78 similarSearchKeywords, 79 relatedSearchKeywords, 80 }; 81 82 await Dataset.pushData([dataStructure]); 83 }, 84}); 85 86const { searchPhrases = ["apify", "nic"], pages = 1 } = 87 (await KeyValueStore.getInput<Input>()) ?? {}; 88 89const pageArray = [...Array(pages).keys()]; 90 91const requests = searchPhrases.flatMap((searchPhrase) => 92 pageArray.map((pageNr) => { 93 const lookoutQuery = new URLSearchParams([ 94 ["pn", pageNr.toString()], 95 ["wd", searchPhrase], 96 ]); 97 return `https://www.baidu.com/s?${lookoutQuery.toString()}`; 98 }) 99); 100 101await crawler.addRequests(requests); 102await crawler.run(); 103 104await Actor.exit(); 105

src/utils.ts

1import axios from "axios"; 2import cheerio from "cheerio"; 3 4/** 5 * There's 2 discovered baidu links behaviors 6 * - redirect via server response 302, 7 * - timed-out custom redirect with custom HTML (status 200) 8 * @param url Baidu redirect url. Something like: http://www.baidu.com/link?url=someBasey64Chars 9 */ 10export const resolveBaiduLink = async (url: string): Promise<string> => { 11 const { data, headers } = await axios.get<string>(url, { 12 maxRedirects: 0, 13 validateStatus: (status) => [302, 200].includes(status), // Baidu sometimes returns redirect 302 which leads to Axios error. 14 }); 15 16 // Redirect location is enough when available (for 302 response) 17 if (headers.location) return headers.location; 18 19 // Baidu sometimes returns 200 with custom html - address is available in 'noscript' tag 20 const cheerioSelector = cheerio.load(data); 21 22 const noScriptElementContent = cheerioSelector("noscript").html(); 23 24 if (!noScriptElementContent) return ""; 25 26 const fullUrl = /(')(?:(?=(\\?))\2.)*?\1/.exec(noScriptElementContent); 27 28 return fullUrl?.[1]!; 29}; 30 31/** 32 * Resolves results count string to number 33 * @param countText span tag text above first Baidu result, looks like '百度为您找到相关结果约740,000个' 34 */ 35export const resolveResultsCount = (countText: string): number => { 36 const digitMatches = [...countText.matchAll(/\d+/g)]; // Thousands are comma-separated, matches will be in groups - ['740', '000'] 37 38 const digitString = digitMatches.reduce( 39 (digitString, digitMatch) => digitString + digitMatch, 40 "" 41 ); 42 43 return parseInt(digitString); 44}; 45
Developer
Maintained by Community
Actor stats
  • 40 users
  • 709 runs
  • Modified 5 days ago
Categories

You might also like these Actors