Twitter Hashtag Scraper

This Actor is deprecated: its developer has discontinued it and it is no longer available to run.
mtrunkat/twitter

#️⃣ Scrape tweets by hashtag. Download your data as HTML table, JSON, CSV, Excel, XML, and RSS feed.
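As a minimal sketch of the intended usage (assuming the Apify SDK v0.x that this Actor is built on; the hashtag value is purely illustrative), a run could be started from another actor or Node.js script like this:

const Apify = require('apify');

Apify.main(async () => {
    // Invoke the Actor with an input matching INPUT_SCHEMA.json below.
    const run = await Apify.call('mtrunkat/twitter', {
        hashtag: 'apify',
        proxyConfig: { useApifyProxy: true },
    });
    console.log(`Run finished with status: ${run.status}`);
});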

Dockerfile

# Dockerfile contains instructions on how to build a Docker image that
# will contain all the code and configuration needed to run your actor.
# For a full Dockerfile reference,
# see https://docs.docker.com/engine/reference/builder/

# First, specify the base Docker image. Apify provides the following
# base images for your convenience:
#  apify/actor-node-basic (Node.js on Alpine Linux, small and fast)
#  apify/actor-node-chrome (Node.js + Chrome on Debian)
#  apify/actor-node-chrome-xvfb (Node.js + Chrome + Xvfb on Debian)
# For more information, see https://docs.apify.com/actor/build#base-images
# Note that you can use any other image from Docker Hub.
FROM apify/actor-node-chrome

# Second, copy just package.json, since it should be the only file
# that affects "npm install" in the next step. This speeds up the build.
COPY package.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging. The parentheses around "npm list || true" make sure
# that only a failure of "npm list" is ignored, not a failure of the
# preceding "npm install".
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after "npm install", quick builds will be very fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Twitter Hashtag",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "hashtag": {
            "title": "Hashtag",
            "type": "string",
            "description": "Hashtag to search Twitter for",
            "editor": "textfield"
        },
        "proxyConfig": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Optionally use Apify Proxy",
            "prefill": { "useApifyProxy": true, "apifyProxyGroups": ["SHADER"] },
            "editor": "proxy"
        },
        "extendOutputFunction": {
            "title": "Extend output function",
            "type": "string",
            "nullable": true,
            "description": "Function that takes a jQuery handle ($) as argument and returns data that will be merged with the default output",
            "prefill": "($) => { return {} }",
            "editor": "javascript"
        }
    },
    "required": [
        "hashtag",
        "proxyConfig"
    ]
}
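For reference, an input that conforms to this schema might look like the following object (shown as a JavaScript literal; the hashtag value is illustrative and the proxy group mirrors the prefill above):

const exampleInput = {
    hashtag: 'apify',
    proxyConfig: { useApifyProxy: true, apifyProxyGroups: ['SHADER'] },
    extendOutputFunction: '($) => { return {} }',
};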

main.js

const Apify = require('apify');

Apify.main(async () => {
    const userAgent = 'Mozilla/5.0 (Linux; U; Android 3.2; nl-nl; GT-P6800 Build/HTJ85B) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13';
    const { hashtag, extendOutputFunction, proxyConfig } = await Apify.getInput();

    // Enqueue the first URL.
    const requestList = await Apify.openRequestList('hashtags', [
        { url: `https://mobile.twitter.com/search?q=%23${hashtag}` },
    ]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration({
        ...proxyConfig,
    });

    // Open Twitter with JavaScript disabled to be able to switch to the legacy
    // version, save the cookies (which keep the legacy version active) and
    // close the browser.
    const browser = await Apify.launchPuppeteer({
        proxyUrl: proxyConfiguration.newUrl(),
        userAgent,
    });
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(false);
    await page.goto('https://mobile.twitter.com/home', {
        waitUntil: 'networkidle0',
        timeout: 30000,
    });
    await Apify.utils.sleep(1000);
    const cookies = await page.cookies();
    await browser.close();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        useSessionPool: true,
        proxyConfiguration,
        launchPuppeteerOptions: {
            userAgent,
            stealth: true,
        },

        // Here we apply the cookies saved in step 1. Note the "await" before
        // page.goto() - without it, navigation errors would never reach the
        // catch block and the session would not be retired.
        gotoFunction: async ({ page, request, session, puppeteerPool }) => {
            await page.setCookie(...cookies);

            try {
                return await page.goto(request.url, {
                    waitUntil: 'networkidle0',
                });
            } catch (e) {
                session.retire();
                await puppeteerPool.retire(page.browser());

                throw e;
            }
        },

        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);

            await Apify.utils.puppeteer.injectJQuery(page);

            // Enqueue the next page, if there is one.
            try {
                const nextHref = await page.$eval('.w-button-more a', (el) => el.href);
                await requestQueue.addRequest({ url: nextHref });
            } catch (err) {
                console.log(`Url ${request.url} is the last page!`);
            }

            // Extract data from all tweets on the page.
            const pageFunction = ($tweets) => {
                const data = [];

                $tweets.forEach(($tweet) => {
                    data.push({
                        username: $tweet.querySelector('.username').innerText,
                        tweet: $tweet.querySelector('.tweet-text').innerText,
                    });
                });

                return data;
            };

            const data = await page.$$eval('table.tweet', pageFunction);

            // Evaluate the user-provided extend output function, if any, and
            // merge its result into every tweet record.
            const userFnData = extendOutputFunction
                ? await page.evaluate(async (fn) => {
                    const result = eval(fn)(window.jQuery);
                    return typeof result === 'object' ? result : {};
                }, extendOutputFunction)
                : {};

            await Apify.pushData(data.map((res) => ({ ...res, ...userFnData })));
        },

        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    await crawler.run();
});
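The handlePageFunction above eval's the extend output function from the input in the page context and calls it with the page's jQuery handle. A hypothetical example value, which would add the page title to every tweet record (nothing here is specific to Twitter's markup):

// Receives the page's jQuery handle ($); the returned object is merged
// into every tweet record pushed to the dataset.
($) => {
    return {
        pageTitle: $('title').text(),
    };
};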

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^0.21.0"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}