Telegram Channel Scraper
Pricing
Pay per usage
Go to Store
Telegram Channel Scraper
Telegram channel scraper for getting public messages without a login, phone number, or the Telegram app. Scraped messages include forwarded-from details and Telegram reply URLs.
3.0 (1)
Pricing
Pay per usage
23
Total users
2.2k
Monthly users
69
Runs succeeded
>99%
Issue response
64 days
Last modified
2 years ago
.dockerignore
# configurations.idea
# crawlee and apify storage foldersapify_storagecrawlee_storagestorage
# installed filesnode_modules
# git folder.git
.editorconfig
root = true
[*]indent_style = spaceindent_size = 4charset = utf-8trim_trailing_whitespace = trueinsert_final_newline = trueend_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store.ideadistnode_modulesapify_storagestorage
# Added by Apify CLI.venv
package.json
{ "name": "telegram-channel-scraper", "version": "0.0.1", "type": "module", "description": "This is a boilerplate of an Apify actor.", "engines": { "node": ">=16.0.0" }, "dependencies": { "apify": "^3.1.4", "crawlee": "^3.3.2" }, "devDependencies": { "@apify/eslint-config": "^0.3.1", "eslint": "^8.36.0" }, "scripts": { "start": "node src/main.js", "lint": "eslint ./src --ext .js,.jsx", "lint:fix": "eslint ./src --ext .js,.jsx --fix", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}
.actor/actor.json
{ "actorSpecification": 1, "name": "telegram-channel-scraper", "title": "Telegram Channel Scraper", "description": "Crawlee and Cheerio project in javascript.", "version": "0.0", "meta": { "templateId": "js-crawlee-cheerio" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": { "actorSpecification": 1, "title": "", "description": "", "views": {} } }}
.actor/Dockerfile
# Specify the base Docker image. You can read more about# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images# You can also use any other image from Docker Hub.FROM apify/actor-node:16
# Copy just package.json and package-lock.json# to speed up the build using Docker layer cache.COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to# keep the image small. Avoid logging too much and print the dependency# tree for debuggingRUN npm --quiet set progress=false \ && npm install --omit=dev --omit=optional \ && echo "Installed NPM packages:" \ && (npm list --omit=dev --all || true) \ && echo "Node.js version:" \ && node --version \ && echo "NPM version:" \ && npm --version \ && rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.# Since we do this after NPM install, quick build will be really fast# for most source file changes.COPY . ./
# Run the image.CMD npm start --silent
.actor/input_schema.json
{ "title": "CheerioCrawler Template", "type": "object", "schemaVersion": 1, "properties": { "channels": { "title": "Telegram channel(s)", "type": "array", "description": "Provide one or several Telegram channel names you want to scrape the posts from.", "editor": "stringList", "prefill": ["mediumcom"] }, "postsFrom": { "title": "Order number of the first post to scrape", "type": "integer", "description": "Channel posts numbered by auto increment from 1", "editor": "number", "prefill": 10, "minimum": 1 }, "postsTo": { "title": "Order number of the last post to scrape", "type": "integer", "description": "Channel posts numbered by auto increment from 1", "editor": "number", "prefill": 20, "minimum": 1 }, "proxy": { "title": "Proxy configuration", "type": "object", "description": "<strong>A proxy server is required to run this actor!</strong> Either use an Apify residential proxy, or provide your own proxy servers. Datacenter proxies will not work.", "prefill": { "useApifyProxy": true, "apifyProxyGroups": ["RESIDENTIAL"] }, "editor": "proxy", "sectionCaption": "Proxy configuration", "sectionDescription": "<strong>RESIDENTIAL proxy is required to run this actor!</strong> Other proxies will not work." } }, "required": ["channels"]}
src/main.js
import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';
import { router } from './routes.js';

await Actor.init();

// Actor.getInput() resolves to null when the actor is started without any
// input, so fall back to an empty object before destructuring.
const input = await Actor.getInput() ?? {};

const {
    channels = [],
    postsFrom = 10,
    postsTo = 20,
    proxy = {
        useApifyProxy: true,
        apifyProxyGroups: ['RESIDENTIAL'],
    },
} = input;

// Build one embed URL per post number for every requested channel.
// min/max make the range valid regardless of the order the user supplied
// postsFrom/postsTo in; both bounds are inclusive.
const startUrls = [];
for (const channelName of channels) {
    const firstPost = Math.min(postsFrom, postsTo);
    const lastPost = Math.max(postsFrom, postsTo);
    for (let i = firstPost; i <= lastPost; i++) {
        startUrls.push({
            url: `https://t.me/${channelName}/${i}?embed=1`,
            userData: { channelName, id: i },
        });
    }
}

const proxyConfiguration = await Actor.createProxyConfiguration(proxy);

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
});

await crawler.run(startUrls);

await Actor.exit();
src/routes.js
1import { Dataset, createCheerioRouter } from 'crawlee';2
3export const router = createCheerioRouter();4
5router.addDefaultHandler(async ({ request, $, log }) => {6 const { userData: { channelName, id } } = request;7 const error = $('div.tgme_widget_message_error, div.message_media_not_supported').text().trim();8 if (error) {9 log.error(error, request.userData);10 return;11 }12 const author = $('a.tgme_widget_message_author_name').length ? $('a.tgme_widget_message_author_name') : $('a.tgme_widget_message_owner_name');13 const forwarded = $('div.tgme_widget_message_forwarded_from a');14 const post = {15 id,16 channelName,17 authorName: author?.text(),18 authorTelegram: author?.attr('href'),19 repliedTo: $('a.tgme_widget_message_reply').attr('href'),20 forwardedTitle: $(forwarded)?.text() || undefined,21 forwardedFromUrl: $(forwarded)?.attr('href') || undefined,22 date: $('a.tgme_widget_message_date time').attr('datetime'),23 viewsCount: $('span.tgme_widget_message_views').text(),24 text: $('div.tgme_widget_message_text').text(),25 linkPreview: $('a.tgme_widget_message_link_preview').text().trim() || undefined,26 };27 await Dataset.pushData(post);28});29
30/*31router.addHandler('comments', async ({ request, $, log }) => {32});33*/