1
2import axios from "axios";
3
4import * as cheerio from "cheerio";
5
6import { Actor } from "apify";
7
8
9
10
11
// Bring up the Apify Actor runtime; this is the first Actor call the script makes.
await Actor.init();

// Fetch this run's input and echo it to the debug log.
const input = await Actor.getInput();
console.debug("input", input);
17
/**
 * Turn the `startUrls` input list into a de-duplicated array of URL strings.
 *
 * Entries are expected to look like `{ url: "..." }`. Anything else (a missing
 * list, `null` entries, entries whose `url` is absent, not a string, or only
 * whitespace) is ignored instead of throwing. URLs are trimmed *before*
 * de-duplication so that " https://x " and "https://x" collapse into one.
 *
 * @param {unknown} entries - Raw `startUrls` value taken from the Actor input.
 * @returns {string[]} Unique, trimmed URLs in first-seen order.
 */
function normalizeStartUrls(entries) {
  if (!Array.isArray(entries)) {
    return [];
  }

  const urls = entries
    .map((entry) => entry?.url)
    .filter((url) => typeof url === "string")
    .map((url) => url.trim())
    .filter((url) => url !== "");

  // A Set keeps insertion order, so the first occurrence of each URL wins.
  return [...new Set(urls)];
}

// `input` may be null/undefined when the Actor is started without any input.
const startUrls = normalizeStartUrls(input?.startUrls);

console.debug("filtered urls", startUrls);
30
31
32
33
34
/**
 * Download the body of a profile page.
 *
 * Issues a GET request through axios with fixed `Accept`, `Accept-Language`
 * and `User-Agent` headers (the User-Agent is a desktop Chrome string).
 *
 * @param {string} url - Address of the page to request.
 * @returns {Promise<*>} The `data` field of the axios response.
 */
const getProfileHtml = async (url) => {
  const browserHeaders = {
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
  };

  const response = await axios.get(url, { headers: browserHeaders });
  return response.data;
};
46
47
48
49
50
// Matches either a complete double-quoted string literal or a JS comment.
// The string alternative comes first, so comment markers that appear inside a
// string (e.g. the "//" in "https://...") are consumed with the string and are
// never mistaken for a comment.
const STATE_STRING_OR_COMMENT =
  /"[^"\\]*(?:\\[\s\S][^"\\]*)*"|\/\/[^\n]*|\/\*[\s\S]*?\*\//g;

// Matches the tokens that must be rewritten to obtain strict JSON, plus every
// other string literal (group `str`) so that text inside strings is skipped:
//   date  - the string "new Date(y,m,d)"
//   undef - a bare `undefined`, or the string value "undefined" after a colon
//   comma - a trailing comma directly before `}` or `]`
const STATE_TOKEN =
  /(?<date>"new Date\((?<y>\d+),(?<m>\d+),(?<d>\d+)\)")|(?<undef>(?<=:\s*)"undefined"(?=\s*[,}])|\bundefined\b)|(?<str>"[^"\\]*(?:\\[\s\S][^"\\]*)*")|(?<comma>,(?=\s*[}\]]))/g;

/**
 * Convert the JavaScript-flavoured object literal assigned to
 * `window.__INITIAL_STATE__` into text that `JSON.parse` accepts.
 *
 * All rewrites are applied only outside string literals, so escaped quotes,
 * URLs and punctuation inside string values are left untouched.
 *
 * @param {string} raw - Literal text taken from the page.
 * @returns {string} Sanitised text ready for `JSON.parse`.
 */
const sanitizeStateLiteral = (raw) => {
  // Pass 1: drop comments first so a trailing comma followed by a comment is
  // still recognised as trailing in pass 2. A space is left behind so that
  // neighbouring tokens can never be glued together.
  const withoutComments = raw.replace(STATE_STRING_OR_COMMENT, (token) => {
    return token.startsWith('"') ? token : " ";
  });

  // Pass 2: rewrite the non-JSON tokens; ordinary strings are copied verbatim.
  return withoutComments.replace(STATE_TOKEN, (...args) => {
    // With named groups, the last replacer argument is the groups object.
    const { date, y, m, d, undef, comma } = args[args.length - 1];
    if (date !== undefined) {
      return `"${y}-${m}-${d}"`;
    }
    if (undef !== undefined) {
      return "null";
    }
    if (comma !== undefined) {
      return "";
    }
    return args[0];
  });
};

/**
 * Extract and parse the `window.__INITIAL_STATE__` object embedded in a page.
 *
 * @param {string} htmlContent - HTML source of the page.
 * @returns {Promise<object|null>} The parsed state, or `null` when the page
 *   contains no `window.__INITIAL_STATE__` assignment or the literal cannot be
 *   parsed (the parse error is written to `console.error`).
 */
const parseDataFromHtml = async (htmlContent) => {
  const match = htmlContent.match(
    /window\.__INITIAL_STATE__\s*=\s*([\s\S]*?)<\/script>/
  );
  if (!match) {
    return null;
  }

  try {
    // Tolerate surrounding whitespace and a statement-terminating semicolon.
    const literal = match[1].trim().replace(/;$/, "");
    return JSON.parse(sanitizeStateLiteral(literal));
  } catch (e) {
    console.error(e);
    return null;
  }
};
86
/**
 * Download one profile page and pick the fields of interest out of its
 * embedded state. Fields that are absent from the state come back undefined.
 *
 * @param {string} url - Profile page address.
 * @returns {Promise<object>} Record with `basicInfo`, `interactions`, `tags`
 *   and `notes` (the first element of the state's `user.notes`).
 */
const scrapeProfile = async (url) => {
  const profileHtml = await getProfileHtml(url);
  const parsedData = await parseDataFromHtml(profileHtml);
  const user = parsedData?.user;
  const userPageData = user?.userPageData;

  return {
    basicInfo: userPageData?.basicInfo,
    interactions: userPageData?.interactions,
    tags: userPageData?.tags,
    notes: user?.notes?.[0],
  };
};

// Start all profile scrapes at once; allSettled lets the remaining URLs
// finish even when some of them fail.
const settledResults = await Promise.allSettled(
  startUrls.map((url) => scrapeProfile(url))
);

// Keep the successful records and log the reason for each failure.
const profiles = [];
for (const outcome of settledResults) {
  if (outcome.status !== "fulfilled") {
    console.log("error", outcome.reason);
    continue;
  }
  profiles.push(outcome.value);
}

// Hand the collected records to the Actor, then end the run.
await Actor.pushData(profiles);

await Actor.exit();