Xiaohongshu User Profile Scraper

Deprecated

Developed by hzml
Maintained by Community
Scrape data from a XiaoHongShu user profile page (e.g. https://www.xiaohongshu.com/user/profile/56970f036a6a69031d3c6e15).

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 44
Monthly users: 6
Last modified: 2 years ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json
# Added by Apify CLI
storage
.venv

package.json

{
"name": "xiaohongshu-user-profile-scraper",
"version": "1.0",
"type": "module",
"description": "XiaoHongShu user profile scraper",
"engines": {
"node": ">=18.0.0"
},
"dependencies": {
"apify": "^3.1.10",
"axios": "^1.5.0",
"cheerio": "^1.0.0-rc.12",
"crawlee": "^3.6.1"
},
"scripts": {
"start": "node ./src/main.js",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "Hzml",
"license": "ISC"
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Run the image.
CMD npm start --silent

.actor/actor.json

{
"actorSpecification": 1,
"name": "xiaohongshu-user-profile-scraper",
"title": "XiaoHongShu User Profile Scraper",
"version": "1.0",
"description": "Scrape data from XiaoHongShu user profile page.",
"meta": {},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": "./dataset_schema.json"
}
}

.actor/dataset_schema.json

{
"actorSpecification": 1,
"fields": {},
"views": {
"overview": {
"title": "Overview",
"transformation": {
"fields": [
"basicInfo.nickname",
"basicInfo.gender",
"basicInfo.images",
"basicInfo.redId",
"basicInfo.ipLocation",
"basicInfo.desc",
"basicInfo.imageb",
"interactions",
"tags",
"notes"
],
"flatten": [
"basicInfo"
]
},
"display": {
"component": "table",
"properties": {
"basicInfo.nickname": {
"label": "Nickname"
},
"basicInfo.gender": {
"label": "Gender",
"format": "number"
},
"interactions": {
"label": "Interactions"
},
"basicInfo.desc": {
"label": "Description"
},
"basicInfo.images": {
"label": "Images URL",
"format": "link"
},
"basicInfo.redId": {
"label": "Red ID"
},
"basicInfo.ipLocation": {
"label": "IP Location"
},
"basicInfo.imageb": {
"label": "Avatar",
"format": "image"
},
"tags": {
"label": "Tags",
"format": "array"
},
"notes": {
"label": "Notes",
"format": "array"
}
}
}
}
}
}
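
For reference, a single dataset item produced by this Actor roughly follows the shape sketched below. This is a hand-written example based only on the field names used in the view above; every value is made up for illustration, and the nested contents of interactions, tags, and notes come straight from XiaoHongShu's page state, so their exact shape may vary.

{
  "basicInfo": {
    "nickname": "Example User",
    "gender": 0,
    "images": "https://example.com/avatar-small.jpg",
    "redId": "123456789",
    "ipLocation": "上海",
    "desc": "Example profile description",
    "imageb": "https://example.com/avatar-large.jpg"
  },
  "interactions": [],
  "tags": [],
  "notes": []
}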

.actor/input_schema.json

{
"title": "Scrape user profile data from XiaoHongShu",
"type": "object",
"schemaVersion": 1,
"properties": {
"startUrls": {
"title": "list of user profile urls",
"type": "array",
"description": "URLs to start with",
"editor": "requestListSources",
"prefill": [
{
"url": "https://www.xiaohongshu.com/user/profile/5c9cb22d00000000110201ed"
}
]
}
},
"required": [
"startUrls"
]
}
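
When the Actor runs locally (for example via the Apify CLI's apify run, or plain npm start), the Apify SDK reads its input from storage/key_value_stores/default/INPUT.json - the same file the .gitignore above deliberately keeps in source control. A minimal input matching this schema could look like this (both URLs are taken from the examples earlier on this page):

{
  "startUrls": [
    { "url": "https://www.xiaohongshu.com/user/profile/5c9cb22d00000000110201ed" },
    { "url": "https://www.xiaohongshu.com/user/profile/56970f036a6a69031d3c6e15" }
  ]
}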

src/main.js

// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
import axios from "axios";
// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
// Note: currently unused in this script.
import * as cheerio from "cheerio";
// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
import { Actor } from "apify";
// This is an ESM project, so relative imports must include file extensions.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
// import { router } from './routes.js';

// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// The structure of the input is defined in input_schema.json.
const input = await Actor.getInput();
console.debug("input", input);

// Default to an empty list so a missing input fails gracefully instead of throwing on destructuring.
let { startUrls = [] } = input ?? {};

// Keep only entries with a non-empty URL and reduce them to plain URL strings.
startUrls = startUrls
    .filter(({ url }) => {
        return !!url?.trim();
    })
    .map(({ url }) => url);

// Filter out duplicate URLs.
startUrls = [...new Set(startUrls)];

console.debug("filtered urls", startUrls);

/**
 * Fetch the raw HTML of a profile page.
 * @param {string} url
 */
const getProfileHtml = async (url) => {
    const { data } = await axios.get(url, {
        headers: {
            Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en",
            "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        },
    });
    return data;
};

/**
 * Parse the window.__INITIAL_STATE__ object out of the page HTML.
 * @param {string} htmlContent
 */
const parseDataFromHtml = async (htmlContent) => {
    const match = htmlContent.match(
        /window\.__INITIAL_STATE__=([\s\S]*?)<\/script>/
    );
    if (match) {
        try {
            let jsonData = match[1];

            // Replace undefined values with null.
            jsonData = jsonData.replace(/(?<=:)\s*"?undefined"?\s*(?=[,}])/g, "null");

            // Replace quoted new Date(y,m,d) expressions with a plain "y-m-d" date string.
            jsonData = jsonData.replace(
                /"new Date\((\d+),(\d+),(\d+)\)"/g,
                '"$1-$2-$3"'
            );

            // Double-escape backslash-quote sequences (\" -> \\") so the backslash survives JSON.parse.
            jsonData = jsonData.replace(/\\"/g, '\\\\"');

            // Strip line and block comments.
            jsonData = jsonData.replace(/\/\/.*?\n|\/\*.*?\*\//g, "");

            // Remove trailing commas before closing braces and brackets.
            jsonData = jsonData.replace(/,\s*([}\]])/g, "$1");

            return JSON.parse(jsonData);
        } catch (e) {
            console.error(e);
            return null;
        }
    } else {
        return null;
    }
};

// Fetch and parse all profiles in parallel; allSettled keeps one failing URL from aborting the rest.
const settledResults = await Promise.allSettled(
    startUrls.map(async (url) => {
        const profile_html = await getProfileHtml(url);
        const parsed_data = await parseDataFromHtml(profile_html);
        const userPageData = parsed_data?.["user"]?.["userPageData"];
        return {
            basicInfo: userPageData?.["basicInfo"],
            interactions: userPageData?.["interactions"],
            tags: userPageData?.["tags"],
            notes: parsed_data?.["user"]?.["notes"]?.[0],
        };
    })
);

const profiles = [];

settledResults.forEach((r) => {
    if (r.status === "fulfilled") {
        profiles.push(r.value);
    } else {
        console.error("error", r.reason);
    }
});

// Save the scraped profiles to the Dataset - a table-like storage.
await Actor.pushData(profiles);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
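
Finally, a minimal sketch of how this Actor could be invoked from your own code through the Apify API, using the apify-client package (not a dependency of this project). The Actor ID hzml/xiaohongshu-user-profile-scraper is inferred from the author and Actor name above, so adjust it if the published ID differs.

// run_actor.js - assumes an Apify API token in the APIFY_TOKEN environment variable.
import { ApifyClient } from "apify-client";

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the Actor with the same input shape defined in input_schema.json and wait for the run to finish.
const run = await client
    .actor("hzml/xiaohongshu-user-profile-scraper")
    .call({
        startUrls: [
            { url: "https://www.xiaohongshu.com/user/profile/5c9cb22d00000000110201ed" },
        ],
    });

// Read the scraped profiles back from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);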