Xiaohongshu User Profile Scraper
Go to Store
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Xiaohongshu User Profile Scraper
hzml/xiaohongshu-user-profile-scraper
Scrape data from user profile page (e.g. https://www.xiaohongshu.com/user/profile/56970f036a6a69031d3c6e15).
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.gitignore
1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json
13
14# Added by Apify CLI
15storage
16.venv
package.json
1{
2 "name": "xiaohongshu-user-profile-scraper",
3 "version": "1.0",
4 "type": "module",
5 "description": "XiaoHongShu user profile scraper",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "apify": "^3.1.10",
11 "axios": "^1.5.0",
12 "cheerio": "^1.0.0-rc.12",
13 "crawlee": "^3.6.1"
14 },
15 "scripts": {
16 "start": "node ./src/main.js",
17 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
18 },
19 "author": "Hzml",
20 "license": "ISC"
21}
.actor/Dockerfile
1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --omit=dev --omit=optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --omit=dev --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version \
21 && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "xiaohongshu-user-profile-scraper",
4 "title": "XiaoHongShu User Profile Scraper",
5 "version": "1.0",
6 "description": "Scrape data from XiaoHongShu user profile page.",
7 "meta": {},
8 "input": "./input_schema.json",
9 "dockerfile": "./Dockerfile",
10 "storages": {
11 "dataset": "./dataset_schema.json"
12 }
13}
.actor/dataset_schema.json
1{
2 "actorSpecification": 1,
3 "fields": {},
4 "views": {
5 "overview": {
6 "title": "Overview",
7 "transformation": {
8 "fields": [
9 "basicInfo.nickname",
10 "basicInfo.gender",
11 "basicInfo.images",
12 "basicInfo.redId",
13 "basicInfo.ipLocation",
14 "basicInfo.desc",
15 "basicInfo.imageb",
16 "interactions",
17 "tags",
18 "notes"
19 ],
20 "flatten": [
21 "basicInfo"
22 ]
23 },
24 "display": {
25 "component": "table",
26 "properties": {
27 "basicInfo.nickname": {
28 "label": "Nickname"
29 },
30 "basicInfo.gender": {
31 "label": "Gender",
32 "format": "number"
33 },
34 "interactions": {
35 "label": "Interactions"
36 },
37 "basicInfo.desc": {
38 "label": "Description"
39 },
40 "basicInfo.images": {
41 "label": "Images URL",
42 "format": "link"
43 },
44 "basicInfo.redId": {
45 "label": "Red ID"
46 },
47 "basicInfo.ipLocation": {
48 "label": "IP Location"
49 },
50 "basicInfo.imageb": {
51 "label": "Avatar",
52 "format": "image"
53 },
54 "tags": {
55 "label": "Tags",
56 "format": "array"
57 },
58 "notes": {
59 "label": "Notes",
60 "format": "array"
61 }
62 }
63 }
64 }
65 }
66}
.actor/input_schema.json
1{
2 "title": "Scrape user profile data from XiaoHongShu",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "list of user profile urls",
8 "type": "array",
9 "description": "URLs to start with",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13 "url": "https://www.xiaohongshu.com/user/profile/5c9cb22d00000000110201ed"
14 }
15 ]
16 }
17 },
18 "required": [
19 "startUrls"
20 ]
21}
src/main.js
1// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
2import axios from "axios";
3// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
4import * as cheerio from "cheerio";
5// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
6import { Actor } from "apify";
7// this is ESM project, and as such, it requires you to specify extensions in your relative imports
8// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
9// import { router } from './routes.js';
10
11// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// Structure of input is defined in input_schema.json
const input = await Actor.getInput();
console.debug("input", input);

// Actor.getInput() resolves to null when the run has no input, so fall back
// to an empty object instead of letting the destructuring throw.
let { startUrls } = input ?? {};

// Reduce the request sources to a list of URL strings:
//  - tolerate a missing or non-array `startUrls` and malformed entries,
//  - trim each URL so that stray whitespace neither reaches the HTTP client
//    nor defeats the de-duplication below,
//  - drop entries whose URL is blank.
startUrls = (Array.isArray(startUrls) ? startUrls : [])
  .map((source) => (typeof source?.url === "string" ? source.url.trim() : ""))
  .filter((url) => url !== "");

// filter duplicate urls
startUrls = [...new Set(startUrls)];

console.debug("filtered urls", startUrls);
30
/**
 * Download a profile page with browser-like request headers.
 *
 * @param {string} url address of the profile page to request
 * @returns {Promise<*>} the `data` field of the axios response
 */
const getProfileHtml = async (url) => {
  // Headers sent with every profile request.
  const requestHeaders = {
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
  };

  const response = await axios.get(url, { headers: requestHeaders });
  return response.data;
};
46
/**
 * Extract and parse the `window.__INITIAL_STATE__` payload embedded in a page.
 *
 * The text between `window.__INITIAL_STATE__=` and the next `</script>` is
 * treated as near-JSON: a few textual substitutions turn constructs that
 * `JSON.parse` rejects (bare `undefined` / `NaN` values, comments, trailing
 * commas) into valid JSON before parsing.
 *
 * NOTE: the substitutions are plain regex replacements over the whole payload,
 * so they also apply to matching text that happens to sit inside string values.
 *
 * @param {string} htmlContent raw HTML of the page
 * @returns {Promise<*>} the parsed state, or `null` when the input is not a
 *   string, the marker is not found, or the payload cannot be parsed
 */
const parseDataFromHtml = async (htmlContent) => {
  // A non-string body cannot contain the marker; avoid throwing on `.match`.
  if (typeof htmlContent !== "string") {
    return null;
  }

  const match = htmlContent.match(
    /window\.__INITIAL_STATE__=([\s\S]*?)<\/script>/
  );
  if (!match) {
    return null;
  }

  try {
    const jsonData = match[1]
      // replace undefined (bare or quoted) property values with null
      .replace(/(?<=:)\s*"?undefined"?\s*(?=[,}])/g, "null")
      // replace bare NaN property values with null
      .replace(/(?<=:)\s*NaN\s*(?=[,}])/g, "null")
      // replace new Date with string
      .replace(/"new Date\((\d+),(\d+),(\d+)\)"/g, '"$1-$2-$3"')
      // replace comments
      .replace(/\/\/.*?\n|\/\*.*?\*\//g, "")
      // replace comma at the end of object or array
      .replace(/,\s*([}\]])/g, "$1");

    // Escaped quotes (\") are already valid JSON and must be left untouched:
    // doubling the backslash would terminate the string early and make the
    // whole payload unparseable.
    return JSON.parse(jsonData);
  } catch (e) {
    console.error(e);
    return null;
  }
};
86
// Fetch and parse every profile concurrently. allSettled() is used so that a
// failure for one URL does not discard the results of the others.
const settledResults = await Promise.allSettled(
  startUrls.map(async (url) => {
    const profileHtml = await getProfileHtml(url);
    const parsedData = await parseDataFromHtml(profileHtml);
    const user = parsedData?.["user"];

    // Every output field below is derived from `user`; without it the record
    // would consist solely of undefined values, so report the URL as failed
    // instead of pushing an empty row to the dataset.
    if (user == null) {
      throw new Error(`No user profile data found at ${url}`);
    }

    const userPageData = user["userPageData"];
    return {
      basicInfo: userPageData?.["basicInfo"],
      interactions: userPageData?.["interactions"],
      tags: userPageData?.["tags"],
      notes: user["notes"]?.[0],
    };
  })
);

const profiles = [];

// Results come back in the same order as startUrls, so the index identifies
// which URL a rejection belongs to.
settledResults.forEach((result, index) => {
  if (result.status === "fulfilled") {
    profiles.push(result.value);
  } else {
    console.error("error", startUrls[index], result.reason);
  }
});

// Save the scraped profiles to Dataset - a table-like storage.
await Actor.pushData(profiles);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
Developer
Maintained by Community
Categories