Meta Threads Scraper avatar

Meta Threads Scraper

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Meta Threads Scraper

Meta Threads Scraper

tiger_king/meta-threads-scraper

Scrape a specific thread by providing its URL as input, for example "https://www.threads.net/t/CuZsgfWLyiI".

.actor/Dockerfile

1# Specify the base Docker image. You can read more about
2# the available images at https://crawlee.dev/docs/guides/docker-images
3# You can also use any other image from Docker Hub.
4FROM apify/actor-node-playwright-chrome:16
5
6# Copy just package.json and package-lock.json
7# to speed up the build using Docker layer cache.
8COPY --chown=myuser package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14    && npm install --omit=dev --omit=optional \
15    && echo "Installed NPM packages:" \
16    && (npm list --omit=dev --all || true) \
17    && echo "Node.js version:" \
18    && node --version \
19    && echo "NPM version:" \
20    && npm --version \
21    && rm -r ~/.npm
22
23# Next, copy the remaining files and directories with the source code.
24# Since we do this after NPM install, quick build will be really fast
25# for most source file changes.
26COPY --chown=myuser . ./
27
28
29# Run the image. If you know you won't need headful browsers,
30# you can remove the XVFB start script for a micro perf gain.
31CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

1{
2    "actorSpecification": 1,
3    "name": "my-actor",
4    "title": "Project Playwright Crawler JavaScript",
5    "description": "Crawlee and Playwright project in JavaScript.",
6    "version": "0.0",
7    "meta": {
8        "templateId": "js-crawlee-playwright-chrome"
9    },
10    "input": "./input_schema.json",
11    "dockerfile": "./Dockerfile",
12    "storages": {
13        "dataset": {
14            "actorSpecification": 1,
15            "views": {
16                "overview": {
17                    "title": "Result",
18                    "transformation": {
19                        "fields": [
20                            "containing_thread",
21                            "reply_threads"
22                        ]
23                    },
24                    "display": {
25                        "component": "table",
26                        "properties": {
27                            "containing_thread": {
28                                "label": "containing_thread",
29                                "format": "array"
30                            },
31                            "reply_threads": {
32                                "label": "reply_threads",
33                                "format": "array"
34                            }
35                        }
36                    }
37                }
38            }
39        }
40    }
41}

.actor/input_schema.json

1{
2    "title": "PlaywrightCrawler Template",
3    "type": "object",
4    "schemaVersion": 1,
5    "properties": {
6        "target": {
7            "title": "Target Threads URL",
8            "type": "string",
9            "description": "URL of the single Threads post to scrape",
10            "editor": "textfield",
11            "default": "https://www.threads.net/t/CuZsgfWLyiI"
12        }
13    }
14}

src/main.js

/**
 * Meta Threads scraper.
 *
 * Opens the Threads URL given in the `target` input field in a Playwright
 * browser, captures the JSON body of the page's GraphQL API response and
 * stores its `data.data` payload in the default dataset.
 */

// For more information, see https://docs.apify.com/sdk/js
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
import { PlaywrightCrawler, RequestQueue } from 'crawlee';

// Endpoint whose response carries the thread payload.
const GRAPHQL_ENDPOINT = 'https://www.threads.net/api/graphql';
// Upper bound on how long the handler waits for that response after page load.
const GRAPHQL_WAIT_MS = 6000;

// Maps each page to a promise of its captured thread payload. Keyed weakly so
// entries disappear together with the page object.
const pendingThreadData = new WeakMap();

/**
 * Starts listening for the GraphQL response on `page`.
 *
 * @param {import('playwright').Page} page - Page that is about to navigate.
 * @param {{ debug: (message: string) => void }} log - Logger used for skipped responses.
 * @returns {Promise<unknown>} Resolves with `body.data.data` of the first
 *   GraphQL response that has such a payload; stays pending if none arrives.
 */
function captureThreadData(page, log) {
    return new Promise((resolve) => {
        const onResponse = async (response) => {
            if (response.url() !== GRAPHQL_ENDPOINT) return;
            try {
                const body = await response.json();
                const threadData = body?.data?.data;
                if (threadData == null) return;
                page.off('response', onResponse);
                resolve(threadData);
            } catch (err) {
                // A failure here must not escape: this is an event listener,
                // so a rejection would be unhandled and could kill the process.
                log.debug(`Ignoring unreadable GraphQL response: ${err.message}`);
            }
        };
        page.on('response', onResponse);
    });
}

/**
 * Waits for `promise`, giving up after `ms` milliseconds.
 *
 * @param {Promise<unknown>} promise - Promise to wait for.
 * @param {number} ms - Maximum wait in milliseconds.
 * @returns {Promise<unknown>} The promise's value, or `undefined` on timeout.
 */
function withTimeout(promise, ms) {
    let timer;
    const timeout = new Promise((resolve) => {
        timer = setTimeout(resolve, ms);
    });
    return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

// Initialize the Apify SDK
await Actor.init();

const input = await Actor.getInput();
const target = typeof input?.target === 'string' ? input.target.trim() : '';
if (target === '') {
    // Fail early with a clear message instead of enqueueing an empty URL.
    await Actor.fail('Input field "target" must be a non-empty Threads URL.');
}

const requestQueue = await RequestQueue.open();
await requestQueue.addRequest({ url: target });

const proxyConfiguration = await Actor.createProxyConfiguration();
const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    requestQueue,
    preNavigationHooks: [
        // Attach the listener before navigation so a GraphQL response that
        // arrives while the page is still loading is not missed.
        async ({ page, log }) => {
            pendingThreadData.set(page, captureThreadData(page, log));
        },
    ],
    async requestHandler({ request, page, log }) {
        log.info(`Processing ${request.url}...`);
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });

        const threadData = await withTimeout(pendingThreadData.get(page), GRAPHQL_WAIT_MS);
        if (threadData === undefined) {
            // Throwing lets the crawler retry the request and record the failure.
            throw new Error(`No thread data received from ${GRAPHQL_ENDPOINT} within ${GRAPHQL_WAIT_MS} ms.`);
        }

        log.debug(JSON.stringify(threadData));
        await Actor.pushData(threadData);
        log.info('Thread data saved to the dataset.');
    },
});

await crawler.run();

// Exit successfully
await Actor.exit();

src/routes.js

import { Dataset, createPlaywrightRouter } from 'crawlee';

/**
 * Playwright router with a default handler and a `detail` handler.
 * The `src/main.js` shown alongside this file does not import it.
 */
export const router = createPlaywrightRouter();

/**
 * Default handler: enqueues links matching `https://apify.com/*` and labels
 * them `detail`.
 */
async function enqueueDetailLinks({ enqueueLinks, log }) {
    log.info(`enqueueing new URLs`);
    const linkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };
    await enqueueLinks(linkOptions);
}

/**
 * `detail` handler: logs the page title and pushes the loaded URL together
 * with the title to the dataset.
 */
async function saveTitle({ request, page, log }) {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });

    const record = { url: request.loadedUrl, title };
    await Dataset.pushData(record);
}

router.addDefaultHandler(enqueueDetailLinks);
router.addHandler('detail', saveTitle);

.dockerignore

1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git

.editorconfig

1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf

.eslintrc

1{
2    "extends": "@apify",
3    "root": true
4}

.gitignore

1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage

package.json

1{
2    "name": "crawlee-playwright-javascript",
3    "version": "0.0.1",
4    "type": "module",
5    "description": "This is an example of an Apify actor.",
6    "dependencies": {
7        "apify": "^3.0.0",
8        "crawlee": "^3.0.0",
9        "playwright": "*"
10    },
11    "devDependencies": {
12        "@apify/eslint-config": "^0.3.1",
13        "eslint": "^8.36.0"
14    },
15    "scripts": {
16        "start": "node src/main.js",
17        "lint": "eslint ./src --ext .js,.jsx",
18        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
19        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
20    },
21    "author": "It's not you it's me",
22    "license": "ISC"
23}
Developer
Maintained by Community