
Fast Instagram Hashtag Scraper
Deprecated
Pricing
Pay per usage
Go to Store

Fast Instagram Hashtag Scraper
Deprecated
Quickly scrape thousands of Instagram posts for the given hashtags.
0.0 (0)
Pricing
Pay per usage
6
Total users
645
Monthly users
1
Runs succeeded
>99%
Last modified
a year ago
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "fast-instagram-hashtag-scraper",
    "title": "Fast Instagram Hashtag Scraper",
    "description": "", //N<=200, optional, default ""
    "version": "0.0.14",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "Hashtags Dataset",
            "description": "",
            "views": {
                "overview": {
                    "title": "Overview",
                    "description": "It can take about one minute until the first results are available.",
                    "transformation": {
                        "fields": [
                            "hashtag",
                            "shortcode",
                            "comments",
                            "text",
                            "url",
                            "taken_at_timestamp",
                            "is_video",
                            "thumbnail_src",
                            "likes"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "columns": [
                            { "label": "Post", "format": "link", "field": "$url", "textField": "$shortcode" },
                            { "label": "Thumb", "format": "image", "field": "$thumbnail_src" },
                            { "label": "Text", "format": "text", "field": "$text" },
                            { "label": "Likes", "format": "number", "field": "$likes" },
                            { "label": "Comments", "format": "number", "field": "$comments" },
                            { "label": "Date", "format": "text", "field": "$taken_at_timestamp" }
                        ]
                    }
                }
            }
        }
    }
}
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify"}
.gitignore
# This file tells Git which files shouldn't be added to source control
.idea
node_modules
Dockerfile
# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Second, copy just package.json and package-lock.json, since they should be
# the only files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --only=prod --no-optional --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
INPUT_SCHEMA.json
{ "title": "Input schema for the apify_project actor.", "type": "object", "schemaVersion": 1, "properties": { "hashtags": { "title": "Hashtags", "description": "Hashtags to scrape", "prefill": ["apify"], "default": [], "type": "array", "editor": "stringList" }, "maxPosts": { "title": "Max posts", "description": "How many posts for each hashtag", "prefill": 12, "type": "integer", "editor": "number", "nullable": true } }, "required": [ "hashtags" ]}
apify.json
{ "env": { "npm_config_loglevel": "silent" }}
main.js
// Apify SDK entry point; everything else in this file hangs off it.
const Apify = require('apify');

// Pull the SDK logger out of the utilities namespace.
const { utils: { log } } = Apify;

/**
 * Converts a user-supplied hashtag into the Instagram JSON endpoint for it.
 *
 * Accepted forms are a bare tag (`apify`), a tag with a leading `#`
 * (`#apify`) or an Instagram tag URL
 * (`https://www.instagram.com/explore/tags/apify/`). Surrounding whitespace
 * is ignored, and for URLs any query string / fragment is dropped and
 * percent-encoding is decoded so `hashtag` always holds the plain tag name.
 *
 * @param {string} value Raw hashtag or tag URL.
 * @returns {{ url: string, hashtag: string } | undefined} The endpoint URL
 *   and the normalized hashtag, or `undefined` when the input is empty or
 *   not a usable hashtag (a warning is logged for invalid, non-empty input).
 */
const getHashtagUrl = (value) => {
    if (typeof value !== 'string') {
        return;
    }

    // Validate the trimmed value so stray tabs/newlines do not reject a good tag.
    const input = value.trim();

    if (!input) {
        return;
    }

    const isTagUrl = input.includes('instagram.com/explore/tags/');

    if (!isTagUrl && !/^#?[\p{L}\p{Nd}_]{1,50}$/u.test(input)) {
        log.warning(`Invalid hashtag: ${value}`);
        return;
    }

    let hashtag = '';

    if (isTagUrl) {
        // Stop at "/", "?" or "#" so query strings and fragments never leak into the tag.
        const segment = input.match(/\/explore\/tags\/(?<hashtag>[^/?#]+)/u)?.groups?.hashtag;

        if (!segment) {
            log.warning(`Invalid hashtag: ${value}`);
            return;
        }

        try {
            hashtag = decodeURIComponent(segment).trim();
        } catch {
            // Malformed percent-encoding: fall back to the raw path segment.
            hashtag = segment.trim();
        }
    } else {
        hashtag = input.replace(/#/g, '');
    }

    if (!hashtag) {
        return;
    }

    // Encode the tag explicitly so it can never alter the path or query of the URL.
    const url = new URL(`/explore/tags/${encodeURIComponent(hashtag)}/`, 'https://www.instagram.com/');
    url.searchParams.set('__a', '1');
    url.searchParams.set('__d', 'dis');

    return {
        url: url.toString(),
        hashtag,
    };
};

/**
 * Maps one hashtag JSON response to a flat list of posts and enqueues the
 * request for the next page when the response says there is one.
 *
 * NOTE(review): this function is handed to the base actor as part of the
 * metamorph input, so it is presumably serialized and executed there - keep
 * it self-contained (no references to names outside its own body). Confirm
 * against the base actor before extracting shared helpers.
 *
 * @param {object} context
 * @param {(request: object) => unknown} context.addRequest Enqueues a follow-up request.
 * @param {object} context.data Parsed JSON response; the hashtag payload is read
 *   from `graphql.hashtag`, `data.hashtag` or `hashtag`.
 * @param {object} context.request Current request; `userData.fallback` selects the
 *   GraphQL pagination flavour and `userData.ids` carries already-seen shortcodes.
 * @returns {object[]} Posts not seen before for this hashtag.
 */
const filterMap = ({ addRequest, data, request }) => {
    // Throws when the payload has none of the known shapes.
    const { edge_hashtag_to_media, edge_hashtag_to_top_posts, name } = data.graphql?.hashtag ?? data.data?.hashtag ?? data.hashtag;

    const userData = request.userData ?? {};
    const { fallback } = userData;

    // Shortcodes already emitted on earlier pages of this hashtag.
    const dedup = { ...(userData.ids ?? {}) };
    const alreadySeen = (shortcode) => Object.prototype.hasOwnProperty.call(dedup, shortcode);

    const out = ({ node }) => node;
    // Accepts either a `{ edges: [...] }` connection or a plain array of edges.
    const toNodes = (connection) => {
        const edges = Array.isArray(connection) ? connection : connection?.edges;
        return Array.isArray(edges) ? edges.map(out) : [];
    };

    const posts = [
        ...toNodes(edge_hashtag_to_media),
        ...toNodes(edge_hashtag_to_top_posts),
    ];

    const mappedPosts = posts.map((post) => {
        const {
            shortcode,
            edge_liked_by,
            edge_media_to_caption,
            edge_media_to_comment,
            edge_media_preview_like,
            taken_at_timestamp,
            owner,
            ...rest
        } = post;

        if (alreadySeen(shortcode)) {
            return;
        }

        dedup[shortcode] = 1;

        const captions = edge_media_to_caption?.edges?.map?.(({ node }) => (node?.text ?? '').trim()) ?? [];

        return {
            hashtag: name,
            ...rest,
            owner: owner?.id != null ? `${owner.id}` : null,
            taken_at_timestamp: taken_at_timestamp ? new Date(taken_at_timestamp * 1000).toISOString() : null,
            url: `https://www.instagram.com/p/${shortcode}/`,
            shortcode,
            comments: edge_media_to_comment?.count ?? 0,
            likes: edge_liked_by?.count ?? edge_media_preview_like?.count ?? 0,
            text: captions.filter(Boolean).join('\n').trim(),
        };
    }).filter(Boolean);

    if (edge_hashtag_to_media.page_info.has_next_page) {
        let url;
        const headers = {};

        if (fallback) {
            url = new URL('https://www.instagram.com/graphql/query/');

            url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');

            const variables = {
                tag_name: name,
                first: 12,
                after: edge_hashtag_to_media.page_info.end_cursor,
            };

            // Fisher-Yates shuffle of the characters of `s`.
            const randomize = (s) => {
                const chars = s.split('');
                for (let i = chars.length - 1; i > 0; i--) {
                    const j = Math.floor(Math.random() * (i + 1));
                    [chars[i], chars[j]] = [chars[j], chars[i]];
                }
                return chars.join('');
            };

            // Random version-4 style UUID, upper-cased.
            const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
                const r = Math.random() * 16 | 0;
                const v = c === 'x' ? r : (r & 0x3 | 0x8);
                return v.toString(16);
            }).toUpperCase();

            // Reuse the cookie of the current request when it has one, and keep the
            // CSRF header in sync with the token inside that cookie.
            const existingCookie = request.headers?.['cookie'] ?? request.headers?.['Cookie'];
            const csrftoken = existingCookie?.match(/(?:^|;\s*)csrftoken=([^;]+)/)?.[1]
                ?? randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');
            const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

            url.searchParams.set('variables', JSON.stringify(variables));

            Object.assign(headers, {
                referer: `https://www.instagram.com/explore/tags/${name}/`,
                'cookie': existingCookie ?? `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
                'X-CSRFToken': csrftoken,
                'X-IG-App-ID': '936619743392459',
                'X-ASBD-ID': '198387',
                'X-IG-WWW-Claim': '0',
                'X-Requested-With': 'XMLHttpRequest',
            });
        } else {
            url = new URL(request.url);
            url.searchParams.set('max_id', edge_hashtag_to_media.page_info.end_cursor);
        }

        addRequest({
            url: url.toString(),
            headers,
            userData: {
                hashtag: name,
                fallback,
                // Hand the seen shortcodes to the next page so de-duplication spans pages.
                ids: dedup,
            },
        });
    }

    if (request.userData) {
        request.userData.ids = dedup;
    } else {
        request.userData = { ids: dedup };
    }

    return mappedPosts;
};

/**
 * Error hook: when a request failed with an error whose message mentions
 * `text/html`, stops retrying it and enqueues a GraphQL query for the same
 * hashtag instead, flagged with `userData.fallback = true`.
 *
 * NOTE(review): like `filterMap`, this function is passed to the base actor
 * in the metamorph input and is presumably serialized - keep it
 * self-contained. Confirm before sharing helpers with other functions.
 *
 * @param {object} context
 * @param {object} context.request The failed request; `userData.hashtag` is reused.
 * @param {(request: object) => unknown} context.addRequest Enqueues the fallback request.
 * @param {{ message?: string }} context.error The error raised for the request.
 * @returns {Promise<void>}
 */
const handleError = async ({ request, addRequest, error }) => {
    // Errors without a message (or non-Error values) are simply not handled here.
    if (!`${error?.message ?? ''}`.includes('text/html')) {
        return;
    }

    request.noRetry = true;

    const url = new URL('https://www.instagram.com/graphql/query/');
    url.searchParams.set('query_hash', '9b498c08113f1e09617a1703c22b2f32');
    const { hashtag } = request.userData ?? {};

    const variables = {
        tag_name: hashtag,
        first: 12,
    };

    url.searchParams.set('variables', JSON.stringify(variables));

    // Fisher-Yates shuffle of the characters of `s`.
    const randomize = (s) => {
        const chars = s.split('');
        for (let i = chars.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [chars[i], chars[j]] = [chars[j], chars[i]];
        }
        return chars.join('');
    };

    const csrftoken = randomize('161lzyaPD5dndZmkMIzNJZmiy0mjg3aQ');

    // Random version-4 style UUID, upper-cased.
    const uuid = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
        const r = Math.random() * 16 | 0;
        const v = c === 'x' ? r : (r & 0x3 | 0x8);
        return v.toString(16);
    }).toUpperCase();

    const mid = randomize('YtDTlgALAAEC8pwTfToW8F233jaU');

    addRequest({
        url: url.toString(),
        headers: {
            referer: `https://www.instagram.com/explore/tags/${hashtag}/`,
            'cookie': `csrftoken=${csrftoken}; mid=${mid}; ig_did=${uuid}; ig_nrcb=1`,
            'X-CSRFToken': csrftoken,
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': '0',
            'X-Requested-With': 'XMLHttpRequest',
        },
        userData: {
            hashtag,
            fallback: true,
        },
    });
};

/**
 * Splits a string into whitespace-separated tokens, so a single input entry
 * may hold several hashtags separated by spaces, tabs or line breaks.
 * Leading and trailing whitespace is ignored; an empty or blank string
 * yields `['']`.
 *
 * @param {string} s
 * @returns {string[]}
 */
const splitSpaces = (s) => s.trim().split(/\s+/);

// Actor entry point: turns the input hashtags into start URLs and hands the
// crawl over to the JSON downloader base actor via metamorph.
Apify.main(async () => {
    // getInput() may resolve to null when the actor is started without input.
    const input = (await Apify.getInput()) ?? {};
    const hashtags = input.hashtags ?? [];
    const { maxPosts } = input;

    const startUrls = hashtags.flatMap(splitSpaces).map(getHashtagUrl).filter(Boolean);

    if (!startUrls.length) {
        throw new Error('You need to provide hashtags to scrape');
    }

    // maxPosts is optional and nullable in the input schema; avoid logging "NaN"/"0 posts".
    if (typeof maxPosts === 'number' && maxPosts > 0) {
        log.info(`${startUrls.length} hashtags, will scrape at most ${startUrls.length * maxPosts} posts`);
    } else {
        log.info(`${startUrls.length} hashtags, no post limit set`);
    }

    await Apify.metamorph('pocesar/json-downloader-base', {
        startUrls: startUrls.map(({ url, hashtag }) => ({
            url,
            userData: {
                hashtag,
            },
        })),
        filterMap,
        // NOTE(review): maxPosts is described as "posts for each hashtag" in the input
        // schema but is passed on as a request limit - confirm what the base actor expects.
        maxRequestsPerCrawl: maxPosts,
        maxRequestRetries: 3,
        handleError,
        proxyConfig: {
            useApifyProxy: true,
            apifyProxyGroups: ["RESIDENTIAL"],
        },
        silent: true,
        debugLog: false,
    });
});
package.json
{ "name": "project-empty", "version": "0.0.1", "description": "This is a boilerplate of an Apify actor.", "dependencies": { "apify": "^2.3.2" }, "scripts": { "start": "node main.js" }, "author": "It's not you it's me", "license": "ISC"}