

Twitter Hashtag Scraper
Deprecated
#️⃣ Scrape tweets by hashtag. Download your data as an HTML table, JSON, CSV, Excel, XML, or RSS feed.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 489
Monthly users: 1
Last modified: 4 years ago
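
The actor stores one record per tweet in the default dataset; the export formats listed above (HTML table, JSON, CSV, Excel, XML, RSS) are generated from that dataset. Based on main.js below, each record holds the author's username and the tweet text, merged with whatever the optional extend output function returns. A sample record, with purely illustrative values, might look like:

{
    "username": "@someuser",
    "tweet": "Testing a new scraper today #apify"
}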
Dockerfile
# Dockerfile contains instructions how to build a Docker image that
# will contain all the code and configuration needed to run your actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/

# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
#   apify/actor-node-basic (Node.js on Alpine Linux, small and fast)
#   apify/actor-node-chrome (Node.js + Chrome on Debian)
#   apify/actor-node-chrome-xvfb (Node.js + Chrome + Xvfb on Debian)
# For more information, see https://docs.apify.com/actor/build#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome

# Second, copy just package.json since it should be the only file
# that affects "npm install" in the next step, to speed up the build
COPY package.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list || true \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
INPUT_SCHEMA.json
{ "title": "Twitter Hashtag", "type": "object", "schemaVersion": 1, "properties": { "hashtag": { "title": "Hashtag", "type": "string", "description": "Hashtag to search twitter", "editor": "textfield" }, "proxyConfig": { "title": "Proxy configuration", "type": "object", "description": "Optionally use Apify Proxy", "prefill": { "useApifyProxy": true, "apifyProxyGroups": ["SHADER"] }, "editor": "proxy" }, "extendOutputFunction": { "title": "Extend output function", "type": "string", "nullable": true, "description": "Function that takes a JQuery handle ($) as argument and returns data that will be merged with the default output", "prefill": "($) => { return {} }", "editor": "javascript" } }, "required": [ "hashtag", "proxyConfig" ]}
main.js
const Apify = require('apify');

Apify.main(async () => {
    const userAgent = 'Mozilla/5.0 (Linux; U; Android 3.2; nl-nl; GT-P6800 Build/HTJ85B) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13';
    const { hashtag, extendOutputFunction, proxyConfig } = await Apify.getInput();

    // Enqueue first URL.
    const requestList = await Apify.openRequestList('hashtags', [
        { url: `https://mobile.twitter.com/search?q=%23${hashtag}` },
    ]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration({
        ...proxyConfig,
    });

    // Open Twitter with JavaScript disabled to be served the legacy version,
    // save the cookies (which carry the switch to the old Twitter UI) and close the browser.
    const browser = await Apify.launchPuppeteer({
        proxyUrl: proxyConfiguration.newUrl(),
        userAgent,
    });
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(false);
    await page.goto('https://mobile.twitter.com/home', {
        waitUntil: 'networkidle0',
        timeout: 30000,
    });
    await Apify.utils.sleep(1000);
    const cookies = await page.cookies();
    await browser.close();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        useSessionPool: true,
        proxyConfiguration,
        launchPuppeteerOptions: {
            userAgent,
            stealth: true,
        },

        // Here we apply our cookies from step 1.
        gotoFunction: async ({ page, request, session, puppeteerPool }) => {
            await page.setCookie(...cookies);

            try {
                // Await here so navigation errors are caught below.
                return await page.goto(request.url, {
                    waitUntil: 'networkidle0',
                });
            } catch (e) {
                // On a failed navigation, rotate both the session and the browser.
                session.retire();
                await puppeteerPool.retire(page.browser());

                throw e;
            }
        },

        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);

            await Apify.utils.puppeteer.injectJQuery(page);

            // Enqueue next page.
            try {
                const nextHref = await page.$eval('.w-button-more a', (el) => el.href);
                await requestQueue.addRequest({ url: nextHref });
            } catch (err) {
                console.log(`Url ${request.url} is the last page!`);
            }

            // Extract data.
            const pageFunction = ($tweets) => {
                const data = [];

                $tweets.forEach(($tweet) => {
                    data.push({
                        username: $tweet.querySelector('.username').innerText,
                        tweet: $tweet.querySelector('.tweet-text').innerText,
                    });
                });

                return data;
            };

            const data = await page.$$eval('table.tweet', pageFunction);

            // Run the user-supplied extend output function (if any) in the page
            // context and merge its result into every record from this page.
            const userFnData = extendOutputFunction
                ? await page.evaluate(async (fn) => {
                    const result = eval(fn)(window.jQuery);
                    return typeof result === 'object' ? result : {};
                }, extendOutputFunction)
                : {};

            await Apify.pushData(data.map((res) => ({ ...res, ...userFnData })));
        },

        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    await crawler.run();
});
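
To sketch how the extend output function hooks in: handlePageFunction evaluates the supplied string in the page context, calls it with the injected jQuery handle, and spreads the returned object into every record scraped from that page. A user-supplied function could therefore look like this (the pageTitle field is hypothetical, shown only for illustration):

($) => {
    // Hypothetical extra field: every record from the page gets the document title.
    return {
        pageTitle: $('title').text(),
    };
}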
package.json
{ "name": "my-actor", "version": "0.0.1", "dependencies": { "apify": "^0.21.0" }, "scripts": { "start": "node main.js" }, "author": "Me!"}