Fast Instagram Hashtag Scraper

This Actor is deprecated and is no longer available because its developer decided to deprecate it.

pocesar/fast-instagram-hashtag-scraper

Quickly scrape thousands of Instagram posts for the given hashtags.

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "fast-instagram-hashtag-scraper",
    "title": "Fast Instagram Hashtag Scraper",
    "description": "",
    "version": "0.0.14",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Hashtags Dataset",
            "description": "",
            "views": {
                "overview": {
                    "title": "Overview",
                    "description": "It can take about one minute until the first results are available.",
                    "transformation": {
                        "fields": [
                            "hashtag",
                            "shortcode",
                            "comments",
                            "text",
                            "url",
                            "taken_at_timestamp",
                            "is_video",
                            "thumbnail_src",
                            "likes"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "columns": [
                            {
                                "label": "Post",
                                "format": "link",
                                "field": "$url",
                                "textField": "$shortcode"
                            },
                            {
                                "label": "Thumb",
                                "format": "image",
                                "field": "$thumbnail_src"
                            },
                            {
                                "label": "Text",
                                "format": "text",
                                "field": "$text"
                            },
                            {
                                "label": "Likes",
                                "format": "number",
                                "field": "$likes"
                            },
                            {
                                "label": "Comments",
                                "format": "number",
                                "field": "$comments"
                            },
                            {
                                "label": "Date",
                                "format": "text",
                                "field": "$taken_at_timestamp"
                            }
                        ]
                    }
                }
            }
        }
    }
}
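
For illustration, a single dataset item rendered by this Overview view could look roughly like the sketch below. All values are made up, and besides the listed fields each item also carries whatever extra properties Instagram returns for the post node (see main.js).

{
    "hashtag": "apify",
    "shortcode": "CgExample123",
    "url": "https://www.instagram.com/p/CgExample123/",
    "text": "Example caption #apify",
    "comments": 3,
    "likes": 42,
    "taken_at_timestamp": "2022-07-01T12:34:56.000Z",
    "is_video": false,
    "thumbnail_src": "https://scontent.cdninstagram.com/example.jpg",
    "owner": "1234567890"
}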

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify"
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.idea
node_modules

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json, since they are
# the only files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --only=prod --no-optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick rebuilds will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Input schema for the apify_project actor.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "hashtags": {
            "title": "Hashtags",
            "description": "Hashtags to scrape",
            "prefill": ["apify"],
            "default": [],
            "type": "array",
            "editor": "stringList"
        },
        "maxPosts": {
            "title": "Max posts",
            "description": "How many posts to scrape for each hashtag",
            "prefill": 12,
            "type": "integer",
            "editor": "number",
            "nullable": true
        }
    },
    "required": [
        "hashtags"
    ]
}
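
An input that satisfies this schema might look like the example below (the tag names are placeholders; maxPosts is optional). Per getHashtagUrl in main.js, each hashtags entry can be a bare tag, a #-prefixed tag, or a full https://www.instagram.com/explore/tags/<tag>/ URL.

{
    "hashtags": ["apify", "webscraping"],
    "maxPosts": 50
}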

apify.json

{
    "env": { "npm_config_loglevel": "silent" }
}

main.js

const Apify = require('apify');

const { log } = Apify.utils;

/**
 * Turns a plain tag, a "#tag" or a full explore/tags URL into the tag's JSON endpoint URL.
 *
 * @param {string} value
 */
const getHashtagUrl = (value) => {
    if (!value) {
        return;
    }

    if (!value.includes('instagram.com/explore/tags/') && !/^#?[\p{L}\p{Nd}_]{1,50}$/u.test(value)) {
        log.warning(`Invalid hashtag: ${value}`);
        return;
    }

    let hashtag = '';

    if (value.includes('instagram.com')) {
        const match = value.trim().match(/\/explore\/tags\/(?<hashtag>[^/]+)\/?/u)?.groups?.hashtag;

        if (!match) {
            log.warning(`Invalid hashtag: ${value}`);
            return;
        }

        hashtag = match.trim();
    } else {
        hashtag = value.trim().replace(/#/g, '');
    }

    if (!hashtag) {
        return;
    }

    const url = new URL(`/explore/tags/${hashtag}/`, 'https://www.instagram.com/');
    url.searchParams.set('__a', '1');
    url.searchParams.set('__d', 'dis');

    return {
        url: url.toString(),
        hashtag,
    };
};

const filterMap = ({ addRequest, data, request }) => {
    const { edge_hashtag_to_media, edge_hashtag_to_top_posts, name } = data.graphql?.hashtag ?? data.data?.hashtag ?? data.hashtag;

    // Shortcodes already emitted for this hashtag, carried across pages via userData.
    const dedup = {
        ...(request.userData?.['ids'] ?? {})
    };

    // Enqueue the next page while Instagram reports more results.
    if (edge_hashtag_to_media.page_info.has_next_page) {
        const { fallback } = request.userData;

        let url;
        const headers = {};

        if (fallback) {
            // The GraphQL fallback endpoint expects an anonymous session, faked here with shuffled sample tokens.
            url = new URL('https://www.instagram.com/graphql/query/');

            url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');

            const variables = {
                tag_name: name,
                first: 12,
                after: edge_hashtag_to_media.page_info.end_cursor,
            };

            // Shuffle the characters of a sample value to produce a random-looking token.
            const randomize = (s) => s.split('').sort((a, b) => -1 * Math.random() * 2 << 0).join('');

            // Random version-4-style UUID used as the ig_did cookie.
            const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
                var r = Math.random()*16|0, v = c == 'x' ? r : (r&0x3|0x8);
                return v.toString(16);
            }).toUpperCase();

            const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
            const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

            url.searchParams.set('variables', JSON.stringify(variables));

            Object.assign(headers, {
                referer: `https://www.instagram.com/explore/tags/${name}/`,
                'cookie': request.headers?.['cookie'] ?? request.headers?.['Cookie'] ?? `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
                'X-CSRFToken': csrftoken,
                'X-IG-App-ID': '936619743392459',
                'X-ASBD-ID': '198387',
                'X-IG-WWW-Claim': '0',
                'X-Requested-With': 'XMLHttpRequest',
            });
        } else {
            // The __a=1 endpoint pages with a max_id query parameter.
            url = new URL(request.url);
            url.searchParams.set('max_id', edge_hashtag_to_media.page_info.end_cursor);
        }

        addRequest({
            url: url.toString(),
            headers,
            userData: {
                hashtag: name,
                fallback,
            }
        });
    }

    const out = ({ node }) => node;

    // Both the chronological feed and the top posts nest their nodes under .edges.
    const posts = [
        edge_hashtag_to_media.edges?.map?.(out) ?? [],
        edge_hashtag_to_top_posts?.edges?.map?.(out) ?? [],
    ].flat();

    // Flatten each node into the dataset item shape, skipping posts already seen.
    const mappedPosts = posts.map((post) => {
        const {
            shortcode,
            edge_liked_by,
            edge_media_to_caption,
            edge_media_to_comment,
            edge_media_preview_like,
            taken_at_timestamp,
            owner,
            ...rest
        } = post;

        if (shortcode in dedup) {
            return;
        }

        dedup[shortcode] = 1;

        return {
            hashtag: name,
            ...rest,
            owner: `${owner.id}`,
            taken_at_timestamp: taken_at_timestamp ? new Date(taken_at_timestamp * 1000).toISOString() : null,
            url: `https://www.instagram.com/p/${shortcode}/`,
            shortcode,
            comments: edge_media_to_comment?.count ?? 0,
            likes: edge_liked_by?.count ?? 0,
            text: (edge_media_to_caption?.edges?.map?.(({ node }) => node.text.trim())?.filter(Boolean).join('\n') ?? '').trim(),
        };
    }).filter(Boolean);

    request.userData.ids = dedup;

    return mappedPosts;
};

const handleError = async ({ request, addRequest, error }) => {
    if (error.message.includes('text/html')) {
        // Instagram answered with an HTML page instead of JSON, so the __a=1 endpoint is blocked.
        // Retry the same hashtag through the public GraphQL endpoint with a fake anonymous session.
        request.noRetry = true;

        const url = new URL('https://www.instagram.com/graphql/query/');
        url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');
        const { hashtag } = request.userData;

        const variables = {
            tag_name: hashtag,
            first: 12,
        };

        url.searchParams.set('variables', JSON.stringify(variables));

        const randomize = (s) => s.split('').sort((a, b) => -1 * Math.random() * 2 << 0).join('');
        const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
        const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
            var r = Math.random()*16|0, v = c == 'x' ? r : (r&0x3|0x8);
            return v.toString(16);
        }).toUpperCase();

        const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

        addRequest({
            url: url.toString(),
            headers: {
                referer: `https://www.instagram.com/explore/tags/${hashtag}/`,
                'cookie': `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
                'X-CSRFToken': csrftoken,
                'X-IG-App-ID': '936619743392459',
                'X-ASBD-ID': '198387',
                'X-IG-WWW-Claim': '0',
                'X-Requested-With': 'XMLHttpRequest',
            },
            userData: {
                hashtag,
                fallback: true,
            }
        });
    }
};

/**
 * Allows multiple space-separated hashtags in a single input entry.
 *
 * @param {string} s
 */
const splitSpaces = (s) => s.split(' ');

Apify.main(async () => {
    const { hashtags = [], maxPosts } = await Apify.getInput();

    const startUrls = hashtags.flatMap(splitSpaces).map(getHashtagUrl).filter(Boolean);

    if (!startUrls.length) {
        throw new Error('You need to provide hashtags to scrape');
    }

    log.info(`${startUrls.length} hashtags, will scrape at most ${maxPosts ? startUrls.length * maxPosts : 'all available'} posts`);

    // Hand the prepared requests over to the generic JSON downloader Actor,
    // which runs the crawl using the filterMap and handleError defined above.
    await Apify.metamorph('pocesar/json-downloader-base', {
        startUrls: startUrls.map(({ url, hashtag }) => ({
            url,
            userData: {
                hashtag,
            },
        })),
        filterMap,
        maxRequestsPerCrawl: maxPosts,
        maxRequestRetries: 3,
        handleError,
        proxyConfig: {
            useApifyProxy: true,
            apifyProxyGroups: ['RESIDENTIAL'],
        },
        silent: true,
        debugLog: false,
    });
});
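
While this Actor was still available, it could be started from your own code with the apify-client package. The snippet below is only a sketch: the APIFY_TOKEN environment variable and the input values are assumptions, not part of this repository.

const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

(async () => {
    // Start the Actor and wait for the (metamorphed) run to finish.
    const run = await client.actor('pocesar/fast-instagram-hashtag-scraper').call({
        hashtags: ['apify'],
        maxPosts: 12,
    });

    // Read the scraped posts from the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.log(`Got ${items.length} posts`);
})();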

package.json

{
    "name": "project-empty",
    "version": "0.0.1",
    "description": "This is a boilerplate of an Apify actor.",
    "dependencies": {
        "apify": "^2.3.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}