Twitter Hashtag Scraper

Deprecated

Developed by Marek Trunkát. Maintained by Community.

#️⃣ Scrape tweets by hashtag. Download your data as an HTML table, JSON, CSV, Excel, XML, or an RSS feed.

Pricing: Pay per usage

Total users: 489

Monthly users: 1

Last modified: 4 years ago
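
Each scraped tweet is stored as a plain record in the dataset. Based on main.js below, a single output item carries a username and a tweet field; the values here are illustrative only:

{
    "username": "@exampleuser",
    "tweet": "An example tweet mentioning #apify"
}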

Dockerfile

# Dockerfile contains instructions on how to build a Docker image that
# will contain all the code and configuration needed to run your actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/
# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
# apify/actor-node-basic (Node.js on Alpine Linux, small and fast)
# apify/actor-node-chrome (Node.js + Chrome on Debian)
# apify/actor-node-chrome-xvfb (Node.js + Chrome + Xvfb on Debian)
# For more information, see https://docs.apify.com/actor/build#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome
# Second, copy just package.json since it should be the only file
# that affects "npm install" in the next step, to speed up the build
COPY package.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& npm list || true \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./
# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Twitter Hashtag",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "hashtag": {
            "title": "Hashtag",
            "type": "string",
            "description": "Hashtag to search Twitter for",
            "editor": "textfield"
        },
        "proxyConfig": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Optionally use Apify Proxy",
            "prefill": { "useApifyProxy": true, "apifyProxyGroups": ["SHADER"] },
            "editor": "proxy"
        },
        "extendOutputFunction": {
            "title": "Extend output function",
            "type": "string",
            "nullable": true,
            "description": "Function that takes a jQuery handle ($) as an argument and returns data that will be merged with the default output",
            "prefill": "($) => { return {} }",
            "editor": "javascript"
        }
    },
    "required": [
        "hashtag",
        "proxyConfig"
    ]
}
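
For illustration, an input that conforms to this schema could look like the following. The hashtag is an example value, and the extend output function shown simply adds the page title to every item:

{
    "hashtag": "apify",
    "proxyConfig": { "useApifyProxy": true, "apifyProxyGroups": ["SHADER"] },
    "extendOutputFunction": "($) => { return { pageTitle: $('title').text() } }"
}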

main.js

const Apify = require('apify');

Apify.main(async () => {
    const userAgent = 'Mozilla/5.0 (Linux; U; Android 3.2; nl-nl; GT-P6800 Build/HTJ85B) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13';
    const { hashtag, extendOutputFunction, proxyConfig } = await Apify.getInput();

    // Enqueue the first URL.
    const requestList = await Apify.openRequestList('hashtags', [
        { url: `https://mobile.twitter.com/search?q=%23${hashtag}` },
    ]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration({
        ...proxyConfig,
    });

    // Open Twitter with JavaScript disabled to be able to switch to the legacy
    // version, save the cookies (which pin the legacy Twitter version)
    // and close the browser.
    const browser = await Apify.launchPuppeteer({
        proxyUrl: proxyConfiguration.newUrl(),
        userAgent,
    });
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(false);
    await page.goto('https://mobile.twitter.com/home', {
        waitUntil: 'networkidle0',
        timeout: 30000,
    });
    await Apify.utils.sleep(1000);
    const cookies = await page.cookies();
    await browser.close();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        useSessionPool: true,
        proxyConfiguration,
        launchPuppeteerOptions: {
            userAgent,
            stealth: true,
        },

        // Here we apply the cookies from step 1.
        gotoFunction: async ({ page, request, session, puppeteerPool }) => {
            await page.setCookie(...cookies);

            try {
                // Await here so that navigation errors are caught by this try/catch.
                return await page.goto(request.url, {
                    waitUntil: 'networkidle0',
                });
            } catch (e) {
                session.retire();
                await puppeteerPool.retire(page.browser());

                throw e;
            }
        },

        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);

            await Apify.utils.puppeteer.injectJQuery(page);

            // Enqueue the next page.
            try {
                const nextHref = await page.$eval('.w-button-more a', (el) => el.href);
                await requestQueue.addRequest({ url: nextHref });
            } catch (err) {
                console.log(`URL ${request.url} is the last page!`);
            }

            // Extract data.
            const pageFunction = ($tweets) => {
                const data = [];

                $tweets.forEach(($tweet) => {
                    data.push({
                        username: $tweet.querySelector('.username').innerText,
                        tweet: $tweet.querySelector('.tweet-text').innerText,
                    });
                });

                return data;
            };

            const data = await page.$$eval('table.tweet', pageFunction);

            // Optionally evaluate the user-provided extend output function in the page context.
            const userFnData = extendOutputFunction ? await page.evaluate(async (fn) => {
                const result = eval(fn)(window.jQuery);
                return typeof result === 'object' ? result : {};
            }, extendOutputFunction) : {};

            await Apify.pushData(data.map((res) => ({ ...res, ...userFnData })));
        },

        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed too many times`);
        },
    });

    await crawler.run();
});
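
To run the actor programmatically and download the results (as JSON here; the dataset items API can also serve CSV, Excel, XML, or RSS), something like the following sketch would work. It assumes the apify-client v2 package and a valid API token; the actor ID is a placeholder, not this actor's real ID:

const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'MY_APIFY_TOKEN' });

(async () => {
    // Run the actor and wait for it to finish.
    const run = await client.actor('<ACTOR_ID>').call({
        hashtag: 'apify',
        proxyConfig: { useApifyProxy: true },
    });

    // Fetch the scraped tweets from the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    items.forEach(({ username, tweet }) => console.log(`${username}: ${tweet}`));
})();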

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^0.21.0"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}