Meta Threads Scraper

No credit card required

This Actor is under maintenance.

This actor is under maintenance and it may be unreliable.

Meta Threads Scraper

Meta Threads Scraper

tiger_king/meta-threads-scraper

No credit card required

Scrape a specific thread by providing a thread URL, such as "https://www.threads.net/t/CuZsgfWLyiI".

.actor/Dockerfile

# Base image with Node.js, Playwright, and Chrome preinstalled.
# Other images are listed at https://crawlee.dev/docs/guides/docker-images
# (any Docker Hub image works as well).
FROM apify/actor-node-playwright-chrome:16

# Copy only the package manifests first so the npm-install layer
# is cached and reused when source files change.
COPY --chown=myuser package*.json ./

# Install production dependencies only (no dev/optional packages) to keep
# the image small; print the dependency tree and tool versions for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Copy the rest of the source code after the install step, so most
# source-only changes rebuild quickly from cache.
COPY --chown=myuser . ./


# Start Xvfb (needed for headful browsers) and run the actor.
# If headless-only, the XVFB script can be dropped for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Project Playwright Crawler JavaScript",
    "description": "Crawlee and Playwright project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-playwright-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "views": {
                "overview": {
                    "title": "Result",
                    "transformation": {
                        "fields": [
                            "containing_thread",
                            "reply_threads"
                        ]
                    },
                    "display": {
                        "component": "table",
                        "properties": {
                            "containing_thread": {
                                "label": "containing_thread",
                                "format": "array"
                            },
                            "reply_threads": {
                                "label": "reply_threads",
                                "format": "array"
                            }
                        }
                    }
                }
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "PlaywrightCrawler Template",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "target": {
            "title": "target Threads URLs",
            "type": "string",
            "description": "URLs to crawl",
            "editor": "textfield",
            "default": "https://www.threads.net/t/CuZsgfWLyiI"
        }
    }
}

src/main.js

/**
 * Meta Threads scraper entry point.
 *
 * Opens the target thread URL with a PlaywrightCrawler, watches network
 * responses for the Threads GraphQL endpoint, and pushes the captured
 * thread payload to the default dataset.
 *
 * For more information, see https://docs.apify.com/sdk/js
 */
import { Actor } from 'apify';
// For more information, see https://crawlee.dev
import { PlaywrightCrawler, RequestQueue } from 'crawlee';

// Initialize the Apify SDK.
await Actor.init();

const input = await Actor.getInput();
console.log('Actor input:', JSON.stringify(input));

const requestQueue = await RequestQueue.open();
// Fall back to the sample thread URL from the input schema; the previous
// fallback ('') enqueued an empty URL and crashed the crawler when no
// input was provided.
const targetUrl = input?.target || 'https://www.threads.net/t/CuZsgfWLyiI';
await requestQueue.addRequest({ url: targetUrl });

const proxyConfiguration = await Actor.createProxyConfiguration();
const crawler = new PlaywrightCrawler({
    proxyConfiguration,
    requestQueue,
    async requestHandler({ request, page, log }) {
        log.info(`Processing ${request.url}...`);
        const title = await page.title();
        log.info(`${title}`, { url: request.loadedUrl });
        // The thread data arrives via an XHR to the GraphQL endpoint;
        // capture that response and store its payload.
        page.on('response', async (response) => {
            console.log(response.url());
            if (response.url() === 'https://www.threads.net/api/graphql') {
                const data = await response.json();
                // Bug fix: was JSON.stringify() with no argument, which
                // logged the string "undefined" instead of the payload.
                log.info(JSON.stringify(data));
                await Actor.pushData(data.data.data);
                console.log('success, waiting to exit');
                await Actor.exit();
            }
        });
        // Keep the page open long enough for the GraphQL request to fire.
        await page.waitForTimeout(6000);
    },
});

await crawler.run();

// Exit successfully.
await Actor.exit();

src/routes.js

import { Dataset, createPlaywrightRouter } from 'crawlee';

export const router = createPlaywrightRouter();

// Default handler: discover links on the landing page and enqueue
// each one for the 'detail' handler below.
router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://apify.com/*'],
        label: 'detail',
    });
});

// Detail handler: record each page's title and final URL in the dataset.
router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    await Dataset.pushData({ url: request.loadedUrl, title });
});

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
    "name": "crawlee-playwright-javascript",
    "version": "0.0.1",
    "type": "module",
    "description": "This is an example of an Apify actor.",
    "dependencies": {
        "apify": "^3.0.0",
        "crawlee": "^3.0.0",
        "playwright": "*"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.3.1",
        "eslint": "^8.36.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
Developer
Maintained by Community
Actor stats
  • 23 users
  • 256 runs
  • Modified 5 months ago

You might also like these Actors