Fast Instagram Hashtag Scraper avatar
Fast Instagram Hashtag Scraper
Deprecated

Pricing

Pay per usage

Go to Store
Fast Instagram Hashtag Scraper

Fast Instagram Hashtag Scraper

Deprecated
pocesar/fast-instagram-hashtag-scraper

Developed by

Paulo Cesar

Maintained by Community

Quickly scrape thousands of Instagram posts for the given hashtags.

0.0 (0)

Pricing

Pay per usage

6

Monthly users

2

Runs succeeded

>99%

Last modified

a year ago

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "fast-instagram-hashtag-scraper",
4    "title": "Fast Instagram Hashtag Scraper",
5    "description": "",
6    "version": "0.0.14",
7    "storages": {
8        "dataset": {
9            "actorSpecification": 1,
10            "title": "Hashtags Dataset",
11            "description": "",
12            "views": {
13                "overview": {
14                    "title": "Overview",
15                    "description": "It can take about one minute until the first results are available.",
16                    "transformation": {
17                        "fields": [
18                            "hashtag",
19                            "shortcode",
20                            "comments",
21                            "text",
22                            "url",
23                            "taken_at_timestamp",
24                            "is_video",
25                            "thumbnail_src",
26                            "likes"
27                        ]
28                    },
29                    "display": {
30                        "component": "table",
31                        "columns": [
32                            {
33                                "label": "Post",
34                                "format": "link",
35                                "field": "$url",
36                                "textField": "$shortcode"
37                            },
38                            {
39                                "label": "Thumb",
40                                "format": "image",
41                                "field": "$thumbnail_src"
42                            },
43                            {
44                                "label": "Text",
45                                "format": "text",
46                                "field": "$text"
47                            },
48                            {
49                                "label": "Likes",
50                                "format": "number",
51                                "field": "$likes"
52                            },
53                            {
54                                "label": "Comments",
55                                "format": "number",
56                                "field": "$comments"
57                            },
58                            {
59                                "label": "Date",
60                                "format": "text",
61                                "field": "$taken_at_timestamp"
62                            }
63                        ]
64                    }
65                }
66            }
67        }
68    }
69}

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify"
3}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.idea
4node_modules

Dockerfile

1# First, specify the base Docker image. You can read more about
2# the available images at https://sdk.apify.com/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node:16
5
6# Second, copy just package.json and package-lock.json since it should be
7# the only file that affects "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --only=prod --no-optional --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Next, copy the remaining files and directories with the source code.
23# Since we do this after NPM install, quick build will be really fast
24# for most source file changes.
25COPY . ./
26
27# Optionally, specify how to launch the source code of your actor.
28# By default, Apify's base Docker images define the CMD instruction
29# that runs the Node.js source code using the command specified
30# in the "scripts.start" section of the package.json file.
31# In short, the instruction looks something like this:
32#
33# CMD npm start

INPUT_SCHEMA.json

1{
2    "title": "Input schema for the apify_project actor.",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "hashtags": {
7            "title": "Hashtags",
8            "description": "Hashtags to scrape",
9            "prefill": ["apify"],
10            "default": [],
11            "type": "array",
12            "editor": "stringList"
13        },
14        "maxPosts": {
15            "title": "Max posts",
16            "description": "How many posts for each hashtag",
17            "prefill": 12,
18            "type": "integer",
19            "editor": "number",
20            "nullable": true
21        }
22    },
23    "required": [
24        "hashtags"
25    ]
26}

apify.json

1{
2    "env": { "npm_config_loglevel": "silent" }
3}

main.js

1const Apify = require('apify');
2
3const { log } = Apify.utils;
4
5/**
6 * @param {string} value
7 */
8const getHashtagUrl = (value) => {
9    if (!value) {
10        return;
11    }
12
13    if (!value.includes('instagram.com/explore/tags/') && !/^#?[\p{L}\p{Nd}_]{1,50}$/u.test(value)) {
14        log.warning(`Invalid hashtag: ${value}`);
15        return;
16    }
17
18    let hashtag = '';
19
20    if (value.includes('instagram.com')) {
21        const match = value.trim().match(/\/explore\/tags\/(?<hashtag>[^/]+)\/?/u)?.groups?.hashtag;
22
23        if (!match) {
24            log.warning(`Invalid hashtag: ${value}`);
25            return;
26        }
27
28        hashtag = match.trim();
29    } else {
30        hashtag = value.trim().replace(/#/g, '');
31    }
32
33    if (!hashtag) {
34        return;
35    }
36    
37    const url = new URL(`/explore/tags/${hashtag}/`, 'https://www.instagram.com/');
38    url.searchParams.set('__a', '1');
39    url.searchParams.set('__d', 'dis');
40
41    return {
42        url: url.toString(),
43        hashtag,
44    };
45}
46
/**
 * Transforms one Instagram tag-page / graphql JSON response into dataset
 * items and schedules the next page when pagination says there is one.
 *
 * NOTE(review): this function is serialized into the input of the
 * `pocesar/json-downloader-base` actor (see Apify.metamorph below), so it
 * must stay fully self-contained — no outer-scope helpers.
 *
 * @param {{ addRequest: Function, data: object, request: object }} ctx
 * @returns {object[]} deduplicated posts from the media and top-posts edges
 */
const filterMap = ({ addRequest, data, request }) => {
    // The payload shape differs between the __a=1 page and the graphql fallback.
    const { edge_hashtag_to_media, edge_hashtag_to_top_posts, name } = data.graphql?.hashtag ?? data.data?.hashtag ?? data.hashtag;

    // Shortcodes already emitted on previous pages, carried via userData.
    const dedup = {
        ...request.userData?.['ids'] ?? {},
    };

    if (edge_hashtag_to_media.page_info.has_next_page) {
        const { fallback } = request.userData;

        let url;
        const headers = {};

        if (fallback) {
            // The graphql endpoint needs the full header/cookie dance.
            url = new URL('https://www.instagram.com/graphql/query/');

            url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');

            const variables = {
                tag_name: name,
                first: 12,
                after: edge_hashtag_to_media.page_info.end_cursor,
            };

            // Cheap shuffle: keeps the length/charset of the seed token while
            // producing a fresh-looking value on every request.
            const randomize = (s) => s.split('').sort(() => -(Math.random() * 2 << 0)).join('');

            // Uppercased v4-style UUID for the ig_did cookie.
            const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
                const r = Math.random() * 16 | 0;
                const v = c === 'x' ? r : (r & 0x3 | 0x8);
                return v.toString(16);
            }).toUpperCase();

            const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
            const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

            url.searchParams.set('variables', JSON.stringify(variables));

            Object.assign(headers, {
                referer: `https://www.instagram.com/explore/tags/${name}/`,
                'cookie': request.headers?.['cookie'] ?? request.headers?.['Cookie'] ?? `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
                'X-CSRFToken': csrftoken,
                'X-IG-App-ID': '936619743392459',
                'X-ASBD-ID': '198387',
                'X-IG-WWW-Claim': '0',
                'X-Requested-With': 'XMLHttpRequest',
            });
        } else {
            // The __a=1 page paginates with a plain max_id query param.
            url = new URL(request.url);
            url.searchParams.set('max_id', edge_hashtag_to_media.page_info.end_cursor);
        }

        addRequest({
            url: url.toString(),
            headers,
            userData: {
                hashtag: name,
                fallback,
            },
        });
    }

    const out = ({ node }) => node;

    // BUG FIX: top posts arrive as a connection object ({ edges: [...] })
    // just like the media edge — the previous code mapped over the object
    // itself, so every top post was silently dropped.
    const posts = [
        edge_hashtag_to_media.edges?.map?.(out) ?? [],
        edge_hashtag_to_top_posts?.edges?.map?.(out) ?? [],
    ].flat();

    const mappedPosts = posts.map((post) => {
        const {
            shortcode,
            edge_liked_by,
            edge_media_to_caption,
            edge_media_to_comment,
            edge_media_preview_like,
            taken_at_timestamp,
            owner,
            ...rest
        } = post;

        if (shortcode in dedup) {
            return;
        }

        dedup[shortcode] = 1;

        return {
            hashtag: name,
            ...rest,
            // Guard: some payloads omit the owner object entirely.
            owner: owner?.id != null ? `${owner.id}` : null,
            taken_at_timestamp: taken_at_timestamp ? new Date(taken_at_timestamp * 1000).toISOString() : null,
            url: `https://www.instagram.com/p/${shortcode}/`,
            shortcode,
            comments: edge_media_to_comment?.count ?? 0,
            // The two endpoints expose the like count under different edges.
            likes: edge_liked_by?.count ?? edge_media_preview_like?.count ?? 0,
            text: (edge_media_to_caption?.edges?.map?.(({ node }) => node.text.trim())?.filter(Boolean).join('\n') ?? '').trim(),
        };
    }).filter(Boolean);

    // Persist the dedup state so the next page skips already-seen posts.
    request.userData.ids = dedup;

    return mappedPosts;
};
149
/**
 * Error hook for the downloader actor: when Instagram answers with an HTML
 * page instead of JSON (the __a=1 endpoint got blocked), stop retrying that
 * URL and re-enqueue the same hashtag against the graphql fallback endpoint
 * with a synthetic anonymous session cookie.
 *
 * Like filterMap, this is serialized into the metamorphed actor's input and
 * must stay self-contained.
 */
const handleError = async ({ request, addRequest, error }) => {
    if (!error.message.includes('text/html')) {
        return;
    }

    request.noRetry = true;

    const { hashtag } = request.userData;

    // Shuffle seed strings to fabricate plausible-looking session tokens.
    const randomize = (s) => s.split('').sort(() => -(Math.random() * 2 << 0)).join('');

    const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
    const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

    // Uppercased v4-style UUID for the ig_did cookie.
    const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
        const r = Math.random() * 16 | 0;
        const v = c === 'x' ? r : (r & 0x3 | 0x8);
        return v.toString(16);
    }).toUpperCase();

    const url = new URL('https://www.instagram.com/graphql/query/');
    url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');
    url.searchParams.set('variables', JSON.stringify({ tag_name: hashtag, first: 12 }));

    addRequest({
        url: url.toString(),
        headers: {
            referer: `https://www.instagram.com/explore/tags/${hashtag}/`,
            'cookie': `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
            'X-CSRFToken': csrftoken,
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': '0',
            'X-Requested-With': 'XMLHttpRequest',
        },
        userData: {
            hashtag,
            fallback: true,
        },
    });
};
192
/**
 * Splits a raw input entry into individual hashtag tokens.
 *
 * Splits on any whitespace run (spaces, tabs, newlines) and drops empty
 * tokens — the previous single-space split let tab/newline-separated or
 * double-spaced input through as junk tokens that were then warned about
 * and dropped downstream.
 *
 * @param {string} s
 * @returns {string[]}
 */
const splitSpaces = (s) => s.split(/\s+/u).filter(Boolean);
197
Apify.main(async () => {
    // getInput() resolves to null when no input was provided — fall back to
    // an empty object so the destructure doesn't throw.
    const { hashtags = [], maxPosts } = await Apify.getInput() ?? {};

    const startUrls = hashtags.flatMap(splitSpaces).map(getHashtagUrl).filter(Boolean);

    if (!startUrls.length) {
        throw new Error('You need to provide hashtags to scrape');
    }

    // maxPosts is nullable in INPUT_SCHEMA.json — avoid logging "NaN posts".
    if (maxPosts) {
        log.info(`${startUrls.length} hashtags, will scrape at most ${startUrls.length * maxPosts} posts`);
    } else {
        log.info(`${startUrls.length} hashtags, no post limit set`);
    }

    // Hand the crawling off to the generic JSON downloader actor. The
    // filterMap / handleError functions are serialized into its input,
    // which is why they must be self-contained.
    await Apify.metamorph('pocesar/json-downloader-base', {
        startUrls: startUrls.map(({ url, hashtag }) => ({
            url,
            userData: {
                hashtag,
            },
        })),
        filterMap,
        maxRequestsPerCrawl: maxPosts,
        maxRequestRetries: 3,
        handleError,
        proxyConfig: {
            useApifyProxy: true,
            apifyProxyGroups: ["RESIDENTIAL"]
        },
        silent: true,
        debugLog: false
    });
});

package.json

1{
2    "name": "project-empty",
3    "version": "0.0.1",
4    "description": "Quickly scrape Instagram posts for the given hashtags.",
5    "dependencies": {
6        "apify": "^2.3.2"
7    },
8    "scripts": {
9        "start": "node main.js"
10    },
11    "author": "It's not you it's me",
12    "license": "ISC"
13}

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.