Telegram Channel Scraper avatar
Telegram Channel Scraper

Pricing

Pay per usage

Go to Store
Telegram Channel Scraper

Telegram Channel Scraper

Developed by

Daniel Milevski

Daniel Milevski

Maintained by Community

Telegram channel scraper for getting public messages without a login, a phone number, or the Telegram app. Forwarded-from and reply URLs in messages are supported.

3.0 (1)

Pricing

Pay per usage

23

Total users

2.2k

Monthly users

69

Runs succeeded

>99%

Issue response

64 days

Last modified

2 years ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
"extends": "@apify",
"root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

package.json

{
"name": "telegram-channel-scraper",
"version": "0.0.1",
"type": "module",
"description": "This is a boilerplate of an Apify actor.",
"engines": {
"node": ">=16.0.0"
},
"dependencies": {
"apify": "^3.1.4",
"crawlee": "^3.3.2"
},
"devDependencies": {
"@apify/eslint-config": "^0.3.1",
"eslint": "^8.36.0"
},
"scripts": {
"start": "node src/main.js",
"lint": "eslint ./src --ext .js,.jsx",
"lint:fix": "eslint ./src --ext .js,.jsx --fix",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}

.actor/actor.json

{
"actorSpecification": 1,
"name": "telegram-channel-scraper",
"title": "Project Cheerio Crawler Javascript",
"description": "Crawlee and Cheerio project in javascript.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-cheerio"
},
"input": "./input_schema.json",
"dockerfile": "./Dockerfile",
"storages": {
"dataset": {
"actorSpecification": 1,
"title": "",
"description": "",
"views": {}
}
}
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./
# Run the image.
CMD npm start --silent

.actor/input_schema.json

{
"title": "CheerioCrawler Template",
"type": "object",
"schemaVersion": 1,
"properties": {
"channels": {
"title": "Telegram channel(s)",
"type": "array",
"description": "Provide one or several Telegram channel names you want to scrape the posts from.",
"editor": "stringList",
"prefill": ["mediumcom"]
},
"postsFrom": {
"title": "Order number of the first post to scrape",
"type": "integer",
"description": "Channel posts numbered by auto increment from 1",
"editor": "number",
"prefill": 10,
"minimum": 1
},
"postsTo": {
"title": "Order number of the last post to scrape",
"type": "integer",
"description": "Channel posts numbered by auto increment from 1",
"editor": "number",
"prefill": 20,
"minimum": 1
},
"proxy": {
"title": "Proxy configuration",
"type": "object",
"description": "<strong>A proxy server is required to run this actor!</strong> Either use an Apify residential proxy, or provide your own proxy servers. Datacenter proxies will not work.",
"prefill": {
"useApifyProxy": true,
"apifyProxyGroups": ["RESIDENTIAL"]
},
"editor": "proxy",
"sectionCaption": "Proxy configuration",
"sectionDescription": "<strong>RESIDENTIAL proxy is required to run this actor!</strong> Other proxies will not work."
}
},
"required": ["channels"]
}

src/main.js

import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';
import { router } from './routes.js';

/**
 * Actor entry point.
 *
 * Reads the actor input, builds one embed-page request per (channel, post number)
 * pair in the requested range, and crawls them with a CheerioCrawler whose
 * request handler is the router exported from ./routes.js.
 */

// Proxy settings used when the input does not provide a `proxy` object.
const DEFAULT_PROXY = {
    useApifyProxy: true,
    apifyProxyGroups: ['RESIDENTIAL'],
};

await Actor.init();

// Actor.getInput() may resolve to null when the run has no input record;
// fall back to an empty object so the defaults below still apply instead of
// throwing a TypeError while reading properties of null.
const input = (await Actor.getInput()) ?? {};

// `??` (rather than destructuring defaults) also covers fields that are
// explicitly set to null, which would otherwise reach Math.min/Math.max as 0.
const channels = input.channels ?? [];
const postsFrom = input.postsFrom ?? 10;
const postsTo = input.postsTo ?? 20;
const proxy = input.proxy ?? DEFAULT_PROXY;

// The range is order-insensitive: swapped bounds are scraped low-to-high.
const firstPost = Math.min(postsFrom, postsTo);
const lastPost = Math.max(postsFrom, postsTo);

const startUrls = [];
for (const channelName of channels) {
    for (let i = firstPost; i <= lastPost; i++) {
        startUrls.push({
            url: `https://t.me/${channelName}/${i}?embed=1`,
            // The route handler reads these back to label the scraped post.
            userData: { channelName, id: i },
        });
    }
}

const proxyConfiguration = await Actor.createProxyConfiguration(proxy);

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
});

await crawler.run(startUrls);

await Actor.exit();

src/routes.js

import { Dataset, createCheerioRouter } from 'crawlee';

export const router = createCheerioRouter();

/**
 * Default route: extracts a single post from the loaded page and pushes it
 * to the dataset.
 *
 * `request.userData` must carry `channelName` and `id`; both are copied into
 * the stored record. When the page contains an error / unsupported-media
 * element with non-empty text, that text is logged as an error and nothing
 * is stored for the request.
 */
router.addDefaultHandler(async ({ request, $, log }) => {
    const { channelName, id } = request.userData;

    const errorText = $('div.tgme_widget_message_error, div.message_media_not_supported').text().trim();
    if (errorText) {
        log.error(errorText, request.userData);
        return;
    }

    // Use the author-name link when present, otherwise fall back to the owner-name link.
    let authorLink = $('a.tgme_widget_message_author_name');
    if (!authorLink.length) {
        authorLink = $('a.tgme_widget_message_owner_name');
    }

    const forwardedLink = $('div.tgme_widget_message_forwarded_from a');
    const replyLink = $('a.tgme_widget_message_reply');
    const timeElement = $('a.tgme_widget_message_date time');
    const previewText = $('a.tgme_widget_message_link_preview').text().trim();

    // Key order is kept stable because it defines the shape of the dataset item.
    await Dataset.pushData({
        id,
        channelName,
        authorName: authorLink.text(),
        authorTelegram: authorLink.attr('href'),
        repliedTo: replyLink.attr('href'),
        // Empty strings are normalised to undefined for the optional fields.
        forwardedTitle: forwardedLink.text() || undefined,
        forwardedFromUrl: forwardedLink.attr('href') || undefined,
        date: timeElement.attr('datetime'),
        viewsCount: $('span.tgme_widget_message_views').text(),
        text: $('div.tgme_widget_message_text').text(),
        linkPreview: previewText || undefined,
    });
});

/*
router.addHandler('comments', async ({ request, $, log }) => {
});
*/