
Fast Instagram Hashtag Scraper (Deprecated)
pocesar/fast-instagram-hashtag-scraper

Quickly scrape thousands of Instagram posts for the given hashtags.

Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 6
Runs succeeded: >99%
Last modified: a year ago
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "fast-instagram-hashtag-scraper",
    "title": "Fast Instagram Hashtag Scraper",
    "description": "",
    "version": "0.0.14",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Hashtags Dataset",
            "description": "",
            "views": {
                "overview": {
                    "title": "Overview",
                    "description": "It can take about one minute until the first results are available.",
                    "transformation": {
                        "fields": [
                            "hashtag",
                            "shortcode",
                            "comments",
                            "text",
                            "url",
                            "taken_at_timestamp",
                            "is_video",
                            "thumbnail_src",
                            "likes"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "columns": [
                            {
                                "label": "Post",
                                "format": "link",
                                "field": "$url",
                                "textField": "$shortcode"
                            },
                            {
                                "label": "Thumb",
                                "format": "image",
                                "field": "$thumbnail_src"
                            },
                            {
                                "label": "Text",
                                "format": "text",
                                "field": "$text"
                            },
                            {
                                "label": "Likes",
                                "format": "number",
                                "field": "$likes"
                            },
                            {
                                "label": "Comments",
                                "format": "number",
                                "field": "$comments"
                            },
                            {
                                "label": "Date",
                                "format": "text",
                                "field": "$taken_at_timestamp"
                            }
                        ]
                    }
                }
            }
        }
    }
}
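
The Overview view above only selects and renders fields that the scraper writes to each dataset item. Purely as an illustration of that shape (every value below is a made-up placeholder, not real scraped data), an item could look like this:

// Hypothetical dataset item; field names match the view above, values are placeholders only.
const exampleItem = {
    hashtag: 'apify',
    shortcode: 'AbCdEfGhIjK',
    comments: 3,
    text: 'Example caption text',
    url: 'https://www.instagram.com/p/AbCdEfGhIjK/',
    taken_at_timestamp: '2022-07-01T12:00:00.000Z',
    is_video: false,
    thumbnail_src: 'https://example.com/thumb.jpg',
    likes: 42,
};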
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify"
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.idea
node_modules
Dockerfile
# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since they should be
# the only files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && (npm list --only=prod --no-optional --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
INPUT_SCHEMA.json
{
    "title": "Input schema for the Fast Instagram Hashtag Scraper actor.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "hashtags": {
            "title": "Hashtags",
            "description": "Hashtags to scrape",
            "prefill": ["apify"],
            "default": [],
            "type": "array",
            "editor": "stringList"
        },
        "maxPosts": {
            "title": "Max posts",
            "description": "How many posts to fetch for each hashtag",
            "prefill": 12,
            "type": "integer",
            "editor": "number",
            "nullable": true
        }
    },
    "required": [
        "hashtags"
    ]
}
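
An input that satisfies this schema is just a list of hashtags (or explore/tags URLs) plus an optional per-hashtag post limit. As a rough sketch of how such an input could be submitted programmatically, assuming the official apify-client package and a valid API token (neither is part of this repository):

// Sketch only: apify-client is not a dependency of this project.
const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'YOUR_APIFY_TOKEN' }); // placeholder token

const input = {
    hashtags: ['apify', '#webscraping'],
    maxPosts: 24,
};

client.actor('pocesar/fast-instagram-hashtag-scraper')
    .call(input)
    .then((run) => console.log(`Run ${run.id} finished with status ${run.status}`))
    .catch((err) => console.error(err));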
apify.json
{
    "env": { "npm_config_loglevel": "silent" }
}
main.js
const Apify = require('apify');

const { log } = Apify.utils;

/**
 * Normalizes a hashtag (with or without '#') or an instagram.com/explore/tags
 * URL into the JSON endpoint URL for that hashtag.
 * @param {string} value
 */
const getHashtagUrl = (value) => {
    if (!value) {
        return;
    }

    if (!value.includes('instagram.com/explore/tags/') && !/^#?[\p{L}\p{Nd}_]{1,50}$/u.test(value)) {
        log.warning(`Invalid hashtag: ${value}`);
        return;
    }

    let hashtag = '';

    if (value.includes('instagram.com')) {
        const match = value.trim().match(/\/explore\/tags\/(?<hashtag>[^/]+)\/?/u)?.groups?.hashtag;

        if (!match) {
            log.warning(`Invalid hashtag: ${value}`);
            return;
        }

        hashtag = match.trim();
    } else {
        hashtag = value.trim().replace(/#/g, '');
    }

    if (!hashtag) {
        return;
    }

    const url = new URL(`/explore/tags/${hashtag}/`, 'https://www.instagram.com/');
    url.searchParams.set('__a', '1');
    url.searchParams.set('__d', 'dis');

    return {
        url: url.toString(),
        hashtag,
    };
};

/**
 * Maps a hashtag JSON response to dataset items and schedules the next page.
 * Passed to the metamorphed actor as part of its input, so it is kept
 * self-contained and does not reference anything from this module's scope.
 */
const filterMap = ({ addRequest, data, request }) => {
    const { edge_hashtag_to_media, edge_hashtag_to_top_posts, name } = data.graphql?.hashtag ?? data.data?.hashtag ?? data.hashtag;

    // Shortcodes already seen on previous pages of this request chain
    const dedup = {
        ...(request.userData?.ids ?? {}),
    };

    if (edge_hashtag_to_media.page_info.has_next_page) {
        const { fallback } = request.userData;

        let url;
        const headers = {};

        if (fallback) {
            // Paginate through the GraphQL endpoint with fabricated session cookies
            url = new URL('https://www.instagram.com/graphql/query/');

            url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');

            const variables = {
                tag_name: name,
                first: 12,
                after: edge_hashtag_to_media.page_info.end_cursor,
            };

            // Crude random shuffle of the characters, used to produce plausible-looking cookie values
            const randomize = (s) => s.split('').sort(() => Math.random() - 0.5).join('');

            // Random v4-style UUID for the ig_did cookie
            const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
                const r = Math.random() * 16 | 0;
                const v = c === 'x' ? r : ((r & 0x3) | 0x8);
                return v.toString(16);
            }).toUpperCase();

            const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
            const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

            url.searchParams.set('variables', JSON.stringify(variables));

            Object.assign(headers, {
                referer: `https://www.instagram.com/explore/tags/${name}/`,
                cookie: request.headers?.cookie ?? request.headers?.Cookie ?? `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
                'X-CSRFToken': csrftoken,
                'X-IG-App-ID': '936619743392459',
                'X-ASBD-ID': '198387',
                'X-IG-WWW-Claim': '0',
                'X-Requested-With': 'XMLHttpRequest',
            });
        } else {
            // Paginate through the plain __a=1 endpoint using max_id
            url = new URL(request.url);
            url.searchParams.set('max_id', edge_hashtag_to_media.page_info.end_cursor);
        }

        addRequest({
            url: url.toString(),
            headers,
            userData: {
                hashtag: name,
                fallback,
            },
        });
    }

    const out = ({ node }) => node;

    const posts = [
        edge_hashtag_to_media.edges?.map?.(out) ?? [],
        edge_hashtag_to_top_posts?.edges?.map?.(out) ?? [],
    ].flat();

    const mappedPosts = posts.map((post) => {
        const {
            shortcode,
            edge_liked_by,
            edge_media_to_caption,
            edge_media_to_comment,
            edge_media_preview_like, // pulled out only to keep it out of ...rest
            taken_at_timestamp,
            owner,
            ...rest
        } = post;

        if (shortcode in dedup) {
            return;
        }

        dedup[shortcode] = 1;

        return {
            hashtag: name,
            ...rest,
            owner: `${owner.id}`,
            taken_at_timestamp: taken_at_timestamp ? new Date(taken_at_timestamp * 1000).toISOString() : null,
            url: `https://www.instagram.com/p/${shortcode}/`,
            shortcode,
            comments: edge_media_to_comment?.count ?? 0,
            likes: edge_liked_by?.count ?? 0,
            text: (edge_media_to_caption?.edges?.map?.(({ node }) => node.text.trim())?.filter(Boolean).join('\n') ?? '').trim(),
        };
    }).filter(Boolean);

    // Carry the dedup map over to the next page request
    request.userData.ids = dedup;

    return mappedPosts;
};

/**
 * When Instagram answers with text/html instead of JSON (soft block), requeue
 * the hashtag against the GraphQL endpoint with fabricated session cookies.
 * Also passed to the metamorphed actor, so it is kept self-contained.
 */
const handleError = async ({ request, addRequest, error }) => {
    if (error.message.includes('text/html')) {
        request.noRetry = true;

        const url = new URL('https://www.instagram.com/graphql/query/');
        url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');
        const { hashtag } = request.userData;

        const variables = {
            tag_name: hashtag,
            first: 12,
        };

        url.searchParams.set('variables', JSON.stringify(variables));

        // Same fabricated session values as in filterMap
        const randomize = (s) => s.split('').sort(() => Math.random() - 0.5).join('');
        const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
        const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
            const r = Math.random() * 16 | 0;
            const v = c === 'x' ? r : ((r & 0x3) | 0x8);
            return v.toString(16);
        }).toUpperCase();

        const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

        addRequest({
            url: url.toString(),
            headers: {
                referer: `https://www.instagram.com/explore/tags/${hashtag}/`,
                cookie: `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
                'X-CSRFToken': csrftoken,
                'X-IG-App-ID': '936619743392459',
                'X-ASBD-ID': '198387',
                'X-IG-WWW-Claim': '0',
                'X-Requested-With': 'XMLHttpRequest',
            },
            userData: {
                hashtag,
                fallback: true,
            },
        });
    }
};

/**
 * @param {string} s
 */
const splitSpaces = (s) => s.split(' ');

Apify.main(async () => {
    const { hashtags = [], maxPosts } = await Apify.getInput();

    const startUrls = hashtags.flatMap(splitSpaces).map(getHashtagUrl).filter(Boolean);

    if (!startUrls.length) {
        throw new Error('You need to provide hashtags to scrape');
    }

    if (maxPosts) {
        log.info(`${startUrls.length} hashtags, will scrape at most ${startUrls.length * maxPosts} posts`);
    } else {
        log.info(`${startUrls.length} hashtags, no post limit set`);
    }

    // Hand the actual crawling over to the generic JSON downloader actor,
    // passing the start URLs and the mapping/error hooks defined above.
    await Apify.metamorph('pocesar/json-downloader-base', {
        startUrls: startUrls.map(({ url, hashtag }) => ({
            url,
            userData: {
                hashtag,
            },
        })),
        filterMap,
        maxRequestsPerCrawl: maxPosts,
        maxRequestRetries: 3,
        handleError,
        proxyConfig: {
            useApifyProxy: true,
            apifyProxyGroups: ['RESIDENTIAL'],
        },
        silent: true,
        debugLog: false,
    });
});
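
For reference, getHashtagUrl accepts either a bare hashtag (with or without the leading #) or a full explore/tags URL and normalizes both to the same JSON endpoint. The helper is not exported from main.js, so the calls below are only an illustration of its behaviour:

// Illustration only: getHashtagUrl is a module-private helper in main.js.
getHashtagUrl('apify');
getHashtagUrl('#apify');
getHashtagUrl('https://www.instagram.com/explore/tags/apify/');
// Each of the above returns:
// { url: 'https://www.instagram.com/explore/tags/apify/?__a=1&__d=dis', hashtag: 'apify' }

// Invalid values are logged with a warning and skipped:
getHashtagUrl('not@valid'); // logs "Invalid hashtag: not@valid", returns undefined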
package.json
{
    "name": "project-empty",
    "version": "0.0.1",
    "description": "This is a boilerplate of an Apify actor.",
    "dependencies": {
        "apify": "^2.3.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
Pricing
Pricing model: Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.