Telegram Channel Scraper

danielmilevski9/telegram-channel-scraper
Telegram Channel Scraper extracts public messages from Telegram channels without a login, a phone number, or the Telegram app. Forwarded-from and Telegram reply URLs inside messages are also captured.
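
As a minimal sketch of calling the Actor from your own code with the apify-client package (the token is a placeholder and the input values are illustrative):

import { ApifyClient } from 'apify-client';

// Placeholder token; use your own Apify API token.
const client = new ApifyClient({ token: '<YOUR_APIFY_TOKEN>' });

// Start the Actor with an illustrative input and wait for the run to finish.
const run = await client.actor('danielmilevski9/telegram-channel-scraper').call({
    channels: ['mediumcom'],
});

// Read the scraped posts from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);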

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv

package.json

{
    "name": "telegram-channel-scraper",
    "version": "0.0.1",
    "type": "module",
    "description": "Apify Actor that scrapes public messages from Telegram channels.",
    "engines": {
        "node": ">=16.0.0"
    },
    "dependencies": {
        "apify": "^3.1.4",
        "crawlee": "^3.3.2"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.3.1",
        "eslint": "^8.36.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "lint": "eslint ./src --ext .js,.jsx",
        "lint:fix": "eslint ./src --ext .js,.jsx --fix",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "telegram-channel-scraper",
    "title": "Telegram Channel Scraper",
    "description": "Scrapes public messages from Telegram channels using Crawlee and Cheerio.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-cheerio"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": {
            "actorSpecification": 1,
            "title": "",
            "description": "",
            "views": {}
        }
    }
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm start --silent

.actor/input_schema.json

{
    "title": "Telegram Channel Scraper input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "channels": {
            "title": "Telegram channel(s)",
            "type": "array",
            "description": "Provide one or several Telegram channel names you want to scrape the posts from.",
            "editor": "stringList",
            "prefill": ["mediumcom"]
        },
        "postsFrom": {
            "title": "Order number of the first post to scrape",
            "type": "integer",
            "description": "Channel posts are numbered by auto-increment starting from 1.",
            "editor": "number",
            "prefill": 10,
            "minimum": 1
        },
        "postsTo": {
            "title": "Order number of the last post to scrape",
            "type": "integer",
            "description": "Channel posts are numbered by auto-increment starting from 1.",
            "editor": "number",
            "prefill": 20,
            "minimum": 1
        },
        "proxy": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "<strong>A proxy server is required to run this actor!</strong> Either use an Apify residential proxy, or provide your own proxy servers. Datacenter proxies will not work.",
            "prefill": {
                "useApifyProxy": true,
                "apifyProxyGroups": ["RESIDENTIAL"]
            },
            "editor": "proxy",
            "sectionCaption": "Proxy configuration",
            "sectionDescription": "<strong>A RESIDENTIAL proxy is required to run this actor!</strong> Other proxies will not work."
        }
    },
    "required": ["channels"]
}
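
For reference, a complete input that conforms to this schema might look like the following (the channel name and post range are only illustrative):

{
    "channels": ["mediumcom"],
    "postsFrom": 10,
    "postsTo": 20,
    "proxy": {
        "useApifyProxy": true,
        "apifyProxyGroups": ["RESIDENTIAL"]
    }
}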

src/main.js

import { Actor } from 'apify';
import { CheerioCrawler } from 'crawlee';
import { router } from './routes.js';

await Actor.init();

// Read the Actor input and fall back to sensible defaults.
const input = await Actor.getInput() ?? {};

const {
    channels = [],
    postsFrom = 10,
    postsTo = 20,
    proxy = {
        useApifyProxy: true,
        apifyProxyGroups: ['RESIDENTIAL'],
    },
} = input;

// Build one request per post ID for every channel, using Telegram's public
// embed pages so no login or API credentials are needed.
const startUrls = [];
for (const channelName of channels) {
    for (let i = Math.min(postsFrom, postsTo); i <= Math.max(postsFrom, postsTo); i++) {
        startUrls.push({
            url: `https://t.me/${channelName}/${i}?embed=1`,
            userData: { channelName, id: i },
        });
    }
}

const proxyConfiguration = await Actor.createProxyConfiguration(proxy);

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
});

await crawler.run(startUrls);

await Actor.exit();
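
With the prefilled input (channel mediumcom, posts 10 to 20), the loop above enqueues public embed URLs such as the ones below; the ?embed=1 query requests Telegram's post widget, which is plain HTML that CheerioCrawler can parse without logging in:

https://t.me/mediumcom/10?embed=1
https://t.me/mediumcom/11?embed=1
...
https://t.me/mediumcom/20?embed=1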

src/routes.js

import { Dataset, createCheerioRouter } from 'crawlee';

export const router = createCheerioRouter();

router.addDefaultHandler(async ({ request, $, log }) => {
    const { userData: { channelName, id } } = request;

    // Telegram renders an error block when the post does not exist or cannot be embedded.
    const error = $('div.tgme_widget_message_error, div.message_media_not_supported').text().trim();
    if (error) {
        log.error(error, request.userData);
        return;
    }

    // Prefer the author_name link; fall back to owner_name when it is missing.
    const author = $('a.tgme_widget_message_author_name').length ? $('a.tgme_widget_message_author_name') : $('a.tgme_widget_message_owner_name');
    const forwarded = $('div.tgme_widget_message_forwarded_from a');

    const post = {
        id,
        channelName,
        authorName: author.text(),
        authorTelegram: author.attr('href'),
        repliedTo: $('a.tgme_widget_message_reply').attr('href'),
        forwardedTitle: forwarded.text() || undefined,
        forwardedFromUrl: forwarded.attr('href') || undefined,
        date: $('a.tgme_widget_message_date time').attr('datetime'),
        viewsCount: $('span.tgme_widget_message_views').text(),
        text: $('div.tgme_widget_message_text').text(),
        linkPreview: $('a.tgme_widget_message_link_preview').text().trim() || undefined,
    };
    await Dataset.pushData(post);
});

/*
router.addHandler('comments', async ({ request, $, log }) => {
});
*/
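
Each successfully handled post is pushed to the default dataset as one record shaped like the sketch below (all values are illustrative; fields such as repliedTo, forwardedTitle, and forwardedFromUrl are simply absent when the post has no reply or forward):

{
    "id": 10,
    "channelName": "mediumcom",
    "authorName": "Medium",
    "authorTelegram": "https://t.me/mediumcom",
    "date": "2023-06-15T12:00:00+00:00",
    "viewsCount": "12.5K",
    "text": "Example post text",
    "linkPreview": "Example link preview"
}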