
Xiaohongshu User Profile Scraper
Deprecated
Pricing
Pay per usage
Go to Store

Xiaohongshu User Profile Scraper
Deprecated
Scrape data from a Xiaohongshu user profile page (e.g. https://www.xiaohongshu.com/user/profile/56970f036a6a69031d3c6e15).
0.0 (0)
Pricing
Pay per usage
1
Total users
44
Monthly users
6
Last modified
2 years ago
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage/*
!storage/key_value_stores
storage/key_value_stores/*
!storage/key_value_stores/default
storage/key_value_stores/default/*
!storage/key_value_stores/default/INPUT.json

# Added by Apify CLI
storage
.venv
package.json
{ "name": "xiaohongshu-user-profile-scraper", "version": "1.0", "type": "module", "description": "XiaoHongShu user profile scraper", "engines": { "node": ">=18.0.0" }, "dependencies": { "apify": "^3.1.10", "axios": "^1.5.0", "cheerio": "^1.0.0-rc.12", "crawlee": "^3.6.1" }, "scripts": { "start": "node ./src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "Hzml", "license": "ISC"}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "xiaohongshu-user-profile-scraper", "title": "XiaoHongShu User Profile Scraper", "version": "1.0", "description": "Scrape data from XiaoHongShu user profile page.", "meta": {}, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": "./dataset_schema.json" }}
.actor/dataset_schema.json
{ "actorSpecification": 1, "fields": {}, "views": { "overview": { "title": "Overview", "transformation": { "fields": [ "basicInfo.nickname", "basicInfo.gender", "basicInfo.images", "basicInfo.redId", "basicInfo.ipLocation", "basicInfo.desc", "basicInfo.imageb", "interactions", "tags", "notes" ], "flatten": [ "basicInfo" ] }, "display": { "component": "table", "properties": { "basicInfo.nickname": { "label": "Nickname" }, "basicInfo.gender": { "label": "Gender", "format": "number" }, "interactions": { "label": "Interactions" }, "basicInfo.desc": { "label": "Description" }, "basicInfo.images": { "label": "Images URL", "format": "link" }, "basicInfo.redId": { "label": "Red ID" }, "basicInfo.ipLocation": { "label": "IP Location" }, "basicInfo.imageb": { "label": "Avatar", "format": "image" }, "tags": { "label": "Tags", "format": "array" }, "notes": { "label": "Notes", "format": "array" } } } } }}
.actor/input_schema.json
{ "title": "Scrape user profile data from XiaoHongShu", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "list of user profile urls", "type": "array", "description": "URLs to start with", "editor": "requestListSources", "prefill": [ { "url": "https://www.xiaohongshu.com/user/profile/5c9cb22d00000000110201ed" } ] } }, "required": [ "startUrls" ]}
src/main.js
1// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).2import axios from "axios";3// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).4import * as cheerio from "cheerio";5// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).6import { Actor } from "apify";7// this is ESM project, and as such, it requires you to specify extensions in your relative imports8// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions9// import { router } from './routes.js';10
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// Structure of input is defined in input_schema.json
const input = await Actor.getInput();
console.debug("input", input);

// Fail with an explicit message rather than an opaque destructuring TypeError
// when the Actor is started without input or without a "startUrls" list.
if (!Array.isArray(input?.startUrls)) {
  throw new Error('Actor input must contain a "startUrls" array.');
}

// Reduce the request sources to a list of unique URL strings:
//  - entries without a string `url` are ignored instead of throwing,
//  - URLs are trimmed, so values that differ only in surrounding whitespace
//    collapse into one entry and blank values are dropped,
//  - the Set removes duplicates while keeping first-seen order.
const startUrls = [
  ...new Set(
    input.startUrls
      .map((source) => (typeof source?.url === "string" ? source.url.trim() : ""))
      .filter((url) => url !== "")
  ),
];

console.debug("filtered urls", startUrls);

/**
 * Download the HTML of a user profile page.
 *
 * The request is sent with desktop-browser-like Accept, Accept-Language and
 * User-Agent headers.
 *
 * @param {string} url - Profile page URL to request.
 * @returns {Promise<string>} The response body as text.
 * @throws Rejects with the axios error when the request fails or times out.
 */
const getProfileHtml = async (url) => {
  const { data } = await axios.get(url, {
    headers: {
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language": "en",
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    },
    // The body is searched with a regular expression afterwards, so it must
    // always come back as a string and never as an auto-parsed object.
    responseType: "text",
    // axios does not time out by default; a stalled connection would otherwise
    // keep the whole run waiting, because every request is awaited together.
    timeout: 30000,
  });
  return data;
};

/**
 * Tokens of interest inside the `window.__INITIAL_STATE__` payload. At each
 * position the leftmost alternative that matches wins, so a complete string
 * literal is always consumed as one token and its content is never mistaken
 * for a comment, a bare identifier or a trailing comma.
 *
 *  1. a double-quoted string literal (escape sequences included)
 *  2. a `//` line comment
 *  3. a block comment
 *  4. the bare identifier `undefined`
 *  5. the bare identifier `NaN`
 *  6. a comma directly before a closing `}` or `]`
 */
const STATE_TOKEN_PATTERN =
  /"[^"\\]*(?:\\[\s\S][^"\\]*)*"|\/\/[^\n]*|\/\*[\s\S]*?\*\/|\bundefined\b|\bNaN\b|,(?=\s*[}\]])/g;

/** A string literal whose entire content is `new Date(y,m,d)`. */
const DATE_LITERAL_PATTERN = /^"new Date\((\d+),(\d+),(\d+)\)"$/;

/**
 * Turn the JavaScript-flavoured state literal into strict JSON text.
 *
 * String literals are copied through untouched (apart from the
 * `"new Date(y,m,d)"` -> `"y-m-d"` rewrite); comments and trailing commas are
 * removed; bare `undefined` / `NaN` become `null`.
 *
 * @param {string} stateSource - Text captured after `window.__INITIAL_STATE__=`.
 * @returns {string} Text intended to be passed to `JSON.parse`.
 */
const sanitizeInitialState = (stateSource) =>
  stateSource.replace(STATE_TOKEN_PATTERN, (token) => {
    if (token.startsWith('"')) {
      return token.replace(DATE_LITERAL_PATTERN, '"$1-$2-$3"');
    }
    if (token === "undefined" || token === "NaN") {
      return "null";
    }
    // What remains is a comment or a trailing comma: drop it.
    return "";
  });

/**
 * Extract and parse the `window.__INITIAL_STATE__` object embedded in a page.
 *
 * @param {string} htmlContent - Full HTML of the page.
 * @returns {Promise<object|null>} The parsed state, or `null` when the page
 *   has no `window.__INITIAL_STATE__=...</script>` assignment or when the
 *   payload still is not valid JSON after sanitizing (the parse error is
 *   logged with `console.error`).
 */
const parseDataFromHtml = async (htmlContent) => {
  const match = htmlContent.match(
    /window\.__INITIAL_STATE__=([\s\S]*?)<\/script>/
  );
  if (!match) {
    return null;
  }

  try {
    return JSON.parse(sanitizeInitialState(match[1]));
  } catch (e) {
    console.error(e);
    return null;
  }
};

// Fetch and parse every profile concurrently. allSettled lets the remaining
// URLs finish even when one of them fails.
const settledResults = await Promise.allSettled(
  startUrls.map(async (url) => {
    const profileHtml = await getProfileHtml(url);
    const parsedData = await parseDataFromHtml(profileHtml);
    if (!parsedData) {
      // Reject instead of producing a record whose fields are all undefined,
      // which would otherwise be stored as an empty dataset row.
      throw new Error("Could not extract window.__INITIAL_STATE__ from the page");
    }

    const userPageData = parsedData.user?.userPageData;
    return {
      basicInfo: userPageData?.basicInfo,
      interactions: userPageData?.interactions,
      tags: userPageData?.tags,
      // Only the first entry of the notes collection is kept.
      notes: parsedData.user?.notes?.[0],
    };
  })
);

const profiles = [];

// allSettled keeps the input order, so the index identifies the failing URL.
settledResults.forEach((result, index) => {
  if (result.status === "fulfilled") {
    profiles.push(result.value);
  } else {
    console.error("error", startUrls[index], result.reason);
  }
});

// Save the scraped profiles to the Dataset - a table-like storage.
await Actor.pushData(profiles);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();