Twitter Hashtag Scraper
Deprecated — view all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
mtrunkat/twitter
#️⃣ Scrape tweets by hashtag. Download your data as HTML table, JSON, CSV, Excel, XML, and RSS feed.
Dockerfile
# Dockerfile with the build instructions for this actor's Docker image.
# Full Dockerfile reference:
# https://docs.docker.com/engine/reference/builder/

# Base image. Apify provides several convenient base images:
#   apify/actor-node-basic        (Node.js on Alpine Linux, small and fast)
#   apify/actor-node-chrome       (Node.js + Chrome on Debian)
#   apify/actor-node-chrome-xvfb  (Node.js + Chrome + Xvfb on Debian)
# See https://docs.apify.com/actor/build#base-images
# Any other image from Docker Hub can be used as well.
FROM apify/actor-node-chrome

# Copy package.json on its own first: it is the only file that affects
# "npm install" in the next step, so Docker can cache the dependency layer
# and rebuilds stay fast.
COPY package.json ./

# Install production dependencies only (skip optional and dev packages) to
# keep the image small; keep logging quiet but print the dependency tree
# and tool versions for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && npm list || true \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the remaining files and directories with the source code. Doing this
# after the install step means most source-only changes do not invalidate
# the cached dependency layer, so incremental builds are fast.
COPY . ./

# No CMD is needed here: Apify's base Docker images already define a CMD
# that runs the Node.js source code via the "scripts.start" entry of
# package.json — effectively:
#
#     CMD npm start
INPUT_SCHEMA.json
{
  "title": "Twitter Hashtag",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "hashtag": {
      "title": "Hashtag",
      "type": "string",
      "description": "Hashtag to search Twitter for (without the leading # character)",
      "editor": "textfield"
    },
    "proxyConfig": {
      "title": "Proxy configuration",
      "type": "object",
      "description": "Optionally use Apify Proxy",
      "prefill": { "useApifyProxy": true, "apifyProxyGroups": ["SHADER"] },
      "editor": "proxy"
    },
    "extendOutputFunction": {
      "title": "Extend output function",
      "type": "string",
      "nullable": true,
      "description": "Function that takes a JQuery handle ($) as argument and returns data that will be merged with the default output",
      "prefill": "($) => { return {} }",
      "editor": "javascript"
    }
  },
  "required": [
    "hashtag",
    "proxyConfig"
  ]
}
main.js
const Apify = require('apify');

/**
 * Twitter Hashtag Scraper actor.
 *
 * Input (see INPUT_SCHEMA.json): { hashtag, proxyConfig, extendOutputFunction }.
 * Scrapes the legacy mobile Twitter search results for `#hashtag`, following
 * "load more" pagination, and pushes { username, tweet } records (optionally
 * merged with the output of the user-supplied extend function) to the dataset.
 */
Apify.main(async () => {
    // Old Android user agent makes Twitter serve the legacy mobile markup
    // that the CSS selectors below ('table.tweet', '.w-button-more') target.
    const userAgent = 'Mozilla/5.0 (Linux; U; Android 3.2; nl-nl; GT-P6800 Build/HTJ85B) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13';
    const { hashtag, extendOutputFunction, proxyConfig } = await Apify.getInput();

    // Enqueue first URL ('%23' is the URL-encoded '#').
    const requestList = await Apify.openRequestList('hashtags', [
        { url: `https://mobile.twitter.com/search?q=%23${hashtag}` },
    ]);
    const requestQueue = await Apify.openRequestQueue();
    const proxyConfiguration = await Apify.createProxyConfiguration({
        ...proxyConfig,
    });

    // Step 1: open Twitter with JS disabled to be able to switch to the old
    // version, save the cookies (which carry the switch to the old Twitter
    // version) and close the browser.
    const browser = await Apify.launchPuppeteer({
        proxyUrl: proxyConfiguration.newUrl(),
        userAgent,
    });
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(false);
    await page.goto('https://mobile.twitter.com/home', {
        waitUntil: 'networkidle0',
        timeout: 30000,
    });
    await Apify.utils.sleep(1000);
    const cookies = await page.cookies();
    await browser.close();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        requestQueue,
        useSessionPool: true,
        proxyConfiguration,
        launchPuppeteerOptions: {
            userAgent,
            stealth: true,
        },

        // Here we apply our cookies from step 1.
        gotoFunction: async ({ page, request, session, puppeteerPool }) => {
            await page.setCookie(...cookies);

            try {
                // FIX: the original `return page.goto(...)` handed the promise
                // back without awaiting it, so a navigation failure rejected
                // outside this try/catch and the session/browser were never
                // retired. `return await` keeps rejections inside the catch.
                return await page.goto(request.url, {
                    waitUntil: 'networkidle0',
                });
            } catch (e) {
                // Bad navigation usually means a blocked session/IP — retire
                // both the session and the browser, then let the crawler retry.
                session.retire();
                await puppeteerPool.retire(page.browser());

                throw e;
            }
        },

        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);

            await Apify.utils.puppeteer.injectJQuery(page);

            // Enqueue next page; the selector is absent on the last page,
            // in which case $eval throws and we simply stop paginating.
            try {
                const nextHref = await page.$eval('.w-button-more a', el => el.href);
                await requestQueue.addRequest({ url: nextHref });
            } catch (err) {
                console.log(`Url ${request.url} is the last page!`);
            }

            // Extract data: one record per tweet table in the legacy markup.
            const pageFunction = ($tweets) => {
                const data = [];

                $tweets.forEach(($tweet) => {
                    data.push({
                        username: $tweet.querySelector('.username').innerText,
                        tweet: $tweet.querySelector('.tweet-text').innerText,
                    });
                });

                return data;
            };

            const data = await page.$$eval('table.tweet', pageFunction);

            // Run the user-supplied extend function (if any) in the page
            // context with the injected jQuery. FIX: await the result so an
            // async user function contributes its resolved value instead of
            // a Promise; also guard against a null result (typeof null is
            // 'object') so the merge below always spreads a plain object.
            const userFnData = extendOutputFunction ? await page.evaluate(async (fn) => {
                const result = await eval(fn)(window.jQuery);
                return typeof result === 'object' && result !== null ? result : {};
            }, extendOutputFunction) : {};

            await Apify.pushData(data.map((res) => ({ ...res, ...userFnData })));
        },

        handleFailedRequestFunction: async ({ request }) => {
            console.log(`Request ${request.url} failed 4 times`);
        },
    });

    await crawler.run();
});
package.json
{
  "name": "my-actor",
  "version": "0.0.1",
  "dependencies": {
    "apify": "^0.21.0"
  },
  "scripts": {
    "start": "node main.js"
  },
  "author": "Me!"
}
Developer
Maintained by Community
Categories