Xiaohongshu User Profile Scraper avatar

Xiaohongshu User Profile Scraper

Deprecated
Go to Store
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Xiaohongshu User Profile Scraper

Xiaohongshu User Profile Scraper

hzml/xiaohongshu-user-profile-scraper

Scrape data from a Xiaohongshu user profile page (e.g. https://www.xiaohongshu.com/user/profile/56970f036a6a69031d3c6e15).

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.gitignore

1# This file tells Git which files shouldn't be added to source control
2.DS_Store
3.idea
4dist
5node_modules
6apify_storage
7storage/*
8!storage/key_value_stores
9storage/key_value_stores/*
10!storage/key_value_stores/default
11storage/key_value_stores/default/*
12!storage/key_value_stores/default/INPUT.json
13
14# Added by Apify CLI
15storage
16.venv

package.json

1{
2	"name": "xiaohongshu-user-profile-scraper",
3	"version": "1.0",
4	"type": "module",
5	"description": "XiaoHongShu user profile scraper",
6	"engines": {
7		"node": ">=18.0.0"
8	},
9	"dependencies": {
10		"apify": "^3.1.10",
11		"axios": "^1.5.0",
12		"cheerio": "^1.0.0-rc.12",
13		"crawlee": "^3.6.1"
14	},
15	"scripts": {
16		"start": "node ./src/main.js",
17		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
18	},
19	"author": "Hzml",
20	"license": "ISC"
21}

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:18
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY . ./
27
28
29# Run the image.
30CMD npm start --silent

.actor/actor.json

1{
2	"actorSpecification": 1,
3	"name": "xiaohongshu-user-profile-scraper",
4	"title": "XiaoHongShu User Profile Scraper",
5	"version": "1.0",
6	"description": "Scrape data from XiaoHongShu user profile page.",
7	"meta": {},
8	"input": "./input_schema.json",
9	"dockerfile": "./Dockerfile",
10	"storages": {
11		"dataset": "./dataset_schema.json"
12	}
13}

.actor/dataset_schema.json

1{
2    "actorSpecification": 1,
3    "fields": {},
4    "views": {
5        "overview": {
6            "title": "Overview",
7            "transformation": {
8                "fields": [
9                    "basicInfo.nickname",
10                    "basicInfo.gender",
11                    "basicInfo.images",
12                    "basicInfo.redId",
13                    "basicInfo.ipLocation",
14                    "basicInfo.desc",
15                    "basicInfo.imageb",
16                    "interactions",
17                    "tags",
18                    "notes"
19                ],
20                "flatten": [
21                    "basicInfo"
22                ]
23            },
24            "display": {
25                "component": "table",
26                "properties": {
27                    "basicInfo.nickname": {
28                        "label": "Nickname"
29                    },
30                    "basicInfo.gender": {
31                        "label": "Gender",
32                        "format": "number"
33                    },
34                    "interactions": {
35                        "label": "Interactions"
36                    },
37                    "basicInfo.desc": {
38                        "label": "Description"
39                    },
40                    "basicInfo.images": {
41                        "label": "Images URL",
42                        "format": "link"
43                    },
44                    "basicInfo.redId": {
45                        "label": "Red ID"
46                    },
47                    "basicInfo.ipLocation": {
48                        "label": "IP Location"
49                    },
50                    "basicInfo.imageb": {
51                        "label": "Avatar",
52                        "format": "image"
53                    },
54                    "tags": {
55                        "label": "Tags",
56                        "format": "array"
57                    },
58                    "notes": {
59                        "label": "Notes",
60                        "format": "array"
61                    }
62                }
63            }
64        }
65    }
66}

.actor/input_schema.json

1{
2    "title": "Scrape user profile data from XiaoHongShu",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "startUrls": {
7            "title": "List of user profile URLs",
8            "type": "array",
9            "description": "URLs to start with",
10            "editor": "requestListSources",
11            "prefill": [
12                {
13                    "url": "https://www.xiaohongshu.com/user/profile/5c9cb22d00000000110201ed"
14                }
15            ]
16        }
17    },
18    "required": [
19        "startUrls"
20    ]
21}

src/main.js

1// Axios - Promise based HTTP client for the browser and node.js (Read more at https://axios-http.com/docs/intro).
2import axios from "axios";
3// Cheerio - The fast, flexible & elegant library for parsing and manipulating HTML and XML (Read more at https://cheerio.js.org/).
4import * as cheerio from "cheerio";
5// Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/js/).
6import { Actor } from "apify";
7// This is an ESM project, so relative imports must include the file extension;
8// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
9// import { router } from './routes.js';
10
// The init() call configures the Actor for its environment. It's recommended to start every Actor with an init().
await Actor.init();

// Structure of input is defined in input_schema.json.
// Actor.getInput() resolves to null when the run has no input (for example a
// local run without INPUT.json), so fall back to an empty object instead of
// crashing on the property access below.
const input = (await Actor.getInput()) ?? {};
console.debug("input", input);

// Tolerate a missing or malformed `startUrls` value: treat it as "nothing to scrape".
const requestedSources = Array.isArray(input.startUrls) ? input.startUrls : [];
if (requestedSources.length === 0) {
  console.warn("No startUrls provided in the input; nothing to scrape.");
}

// Keep only entries that carry a non-blank string `url`. The URLs are trimmed
// BEFORE de-duplication so that " https://x " and "https://x" collapse into a
// single request; the Set preserves first-seen order.
const startUrls = [
  ...new Set(
    requestedSources
      .map((source) => source?.url)
      .filter((url) => typeof url === "string")
      .map((url) => url.trim())
      .filter((url) => url !== "")
  ),
];

console.debug("filtered urls", startUrls);
30
/**
 * Download a profile page with a GET request that sends browser-like headers.
 *
 * @param {string} url Address of the profile page to request.
 * @returns {Promise<*>} The `data` field of the axios response.
 */
const getProfileHtml = async (url) => {
  // Headers sent with the request: an HTML-first Accept list, an English
  // Accept-Language, and a desktop Chrome User-Agent string.
  const requestHeaders = {
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
  };

  const response = await axios.get(url, { headers: requestHeaders });
  return response.data;
};
46
/**
 * Extract and parse the `window.__INITIAL_STATE__` assignment embedded in a page.
 *
 * The captured text is a JavaScript object literal rather than strict JSON, so
 * it is normalised before being handed to `JSON.parse`. All rewriting is
 * string-literal aware: the contents of double-quoted strings (URLs containing
 * `//`, escaped quotes, commas, ...) are never modified, with two deliberate
 * exceptions kept from the previous implementation:
 *   - a string that is exactly `"new Date(y,m,d)"` becomes `"y-m-d"`;
 *   - a property value that is exactly `"undefined"` becomes `null`.
 *
 * Outside of strings:
 *   - bare `undefined` and `NaN` become `null`;
 *   - `//` and `/* *\/` comments are removed;
 *   - trailing commas before `}` or `]` are removed;
 *   - one trailing `;` after the literal is dropped.
 *
 * @param {string} htmlContent Raw HTML of the page.
 * @returns {Promise<object|null>} The parsed state, or `null` when the input is
 *   not a string, the assignment is not found, or the text cannot be parsed.
 */
const parseDataFromHtml = async (htmlContent) => {
  if (typeof htmlContent !== "string") {
    return null;
  }

  const match = htmlContent.match(
    /window\.__INITIAL_STATE__=([\s\S]*?)<\/script>/
  );
  if (!match) {
    return null;
  }

  // Pass 1 tokens, tried in this order at every position outside a string:
  //   1. (captured) the exact string "undefined" used as a property value
  //   2. any complete double-quoted string literal, escapes included
  //   3. a line comment      4. a block comment
  //   5. a bare `undefined` / `NaN` identifier
  // Matching whole string literals as single tokens is what keeps the later
  // alternatives from ever firing inside a string.
  const tokenPattern =
    /((?<=:\s*)"undefined"(?=\s*[,}]))|"[^"\\]*(?:\\[\s\S][^"\\]*)*"|\/\/[^\n]*|\/\*[\s\S]*?\*\/|\b(?:undefined|NaN)\b/g;

  // A string literal that is exactly the serialised form `new Date(y,m,d)`.
  const dateStringPattern = /^"new Date\((\d+),(\d+),(\d+)\)"$/;

  // Pass 2: skip string literals (captured), drop commas that directly precede
  // a closing bracket. Runs after pass 1 so a comment sitting between the
  // comma and the bracket has already been removed.
  const trailingCommaPattern = /("[^"\\]*(?:\\[\s\S][^"\\]*)*")|,(?=\s*[}\]])/g;

  try {
    const literal = match[1].trim().replace(/;$/, "");

    const withoutJsOnlyTokens = literal.replace(
      tokenPattern,
      (token, quotedUndefined) => {
        if (quotedUndefined !== undefined) {
          return "null";
        }
        if (token.startsWith('"')) {
          return token.replace(dateStringPattern, '"$1-$2-$3"');
        }
        if (token === "undefined" || token === "NaN") {
          return "null";
        }
        // A comment: replace with a space so neighbouring tokens never fuse.
        return " ";
      }
    );

    const jsonText = withoutJsOnlyTokens.replace(
      trailingCommaPattern,
      (token, stringLiteral) => stringLiteral ?? ""
    );

    return JSON.parse(jsonText);
  } catch (e) {
    console.error(e);
    return null;
  }
};
86
// Start the fetch-and-parse pipeline for every URL at once. allSettled records
// each URL's outcome separately, so one rejection does not discard the rest.
const settledResults = await Promise.allSettled(
  startUrls.map(async (profileUrl) => {
    const html = await getProfileHtml(profileUrl);
    const state = await parseDataFromHtml(html);

    // Both lookups yield undefined when the state could not be parsed.
    const user = state?.user;
    const pageData = user?.userPageData;

    return {
      basicInfo: pageData?.basicInfo,
      interactions: pageData?.interactions,
      tags: pageData?.tags,
      // Only the first entry of `notes` is kept.
      notes: user?.notes?.[0],
    };
  })
);
100
// Gather the value of every fulfilled outcome; log the reason of each rejected one.
const profiles = [];

for (const outcome of settledResults) {
  if (outcome.status !== "fulfilled") {
    console.log("error", outcome.reason);
    continue;
  }
  profiles.push(outcome.value);
}

// Save the collected profiles to the Dataset - a table-like storage.
await Actor.pushData(profiles);

// Gracefully exit the Actor process. It's recommended to quit all Actors with an exit().
await Actor.exit();
Developer
Maintained by Community