GitHub repos search scraper
Deprecated
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
GitHub repos search scraper
strajk/github-repos-search-scraper
Given a search query (e.g. "Apify"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.
Dockerfile
# Base image: the apify/actor-node image, tag 16 (presumably Node.js 16 — confirm against the image docs).
1FROM apify/actor-node:16
2
# Copy only the package manifest before installing dependencies
# (presumably so the install layer can be cached independently of source changes).
3COPY package.json ./
4
# Disable npm's progress output, then install production dependencies only,
# skipping optional dependencies.
5RUN npm --quiet set progress=false \
6 && npm install --only=prod --no-optional
7
# Copy the remaining files of the build context into the image.
8COPY . ./
INPUT_SCHEMA.json
1{
2 "title": "GitHub repos search scraper",
3 "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
4 "type": "object",
5 "schemaVersion": 1,
6 "properties": {
7 "queries": {
8 "title": "Queries",
9 "description": "List of search queries; each query is searched on GitHub separately (e.g. \"Apify\").",
10 "type": "array",
11 "editor": "stringList"
12 },
13 "token": {
14 "title": "Token",
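15 "description": "GitHub API token used to authenticate requests. Optional, but anonymous access is severely rate-limited.",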
16 "type": "string",
17 "editor": "textfield"
18 },
19 "debug": {
20 "title": "Debug",
21 "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
22 "type": "boolean",
23 "editor": "checkbox",
24 "default": false
25 }
26 },
27 "required": [
28 "queries"
29 ]
30}
apify.json
1{
2 "name": "github-repos-search-scraper",
3 "version": "0.1",
4 "buildTag": "latest",
5 "env": null,
6 "defaultRunOptions": {
7 "build": "latest",
8 "timeoutSecs": 3600,
9 "memoryMbytes": 1024
10 }
11}
main.js
1/**
2 * TODOs
3 * ===
4 * - Better TS
5 * - Verbose-r logs in debug mode
6 * - Output schema
7 *
8 * */
9
10import { Octokit } from "@octokit/rest";
11import Apify from "apify";
12import { sleepUntil } from "./_utils/common";
13const { log } = Apify.utils;
14
/**
 * Actor entry point.
 *
 * For every input query a request is enqueued and processed by a BasicCrawler.
 * Each request pages through `GET /search/repositories` (sorted by stars,
 * descending). When GitHub reports more matches than one search was able to
 * return, a follow-up request is enqueued for the same query restricted to
 * `stars:<N`, where N is derived from the least-starred repo seen so far.
 *
 * Note: the follow-up window overlaps the previous one on the boundary star
 * count, so the dataset may contain duplicate repos.
 */
Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    queries = [`meteor`],
    token, // eg: abc123
    debug = false,
  } = input ?? {};
  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  if (!token) {
    log.warning(
      `No token provided, will use anonymous access, which is severely limited and may cause rate limit issues.`
    );
  }

  // Prepare request queue
  const requestQueue = await Apify.openRequestQueue();
  for (const query of queries) {
    await requestQueue.addRequest({
      url: `https://dummy.com`, // compat with requestQueue
      uniqueKey: query,
      userData: {
        query,
        // `filter` prop added for paginated requests
      },
    });
  }

  // Init GitHub API
  const octokit = new Octokit({ auth: token });

  // Last known rate limit reset time, shared by all requests of this run.
  let rateLimitReset; // TODO: This should be probably saved on session?

  const crawler = new Apify.BasicCrawler({
    handleRequestTimeoutSecs: 60 * 2,
    maxRequestRetries: 0,
    requestQueue,
    handleRequestFunction: async ({ request }) => {
      const results = [];
      const { filter, query } = request.userData;
      log.info(
        `Processing query "${query}", specifying filter "${filter || ``}"`
      );

      let totalCount;
      let rateLimited = false;
      const q = filter ? `${query} stars:<${filter}` : query;

      try {
        // @ts-ignore
        await octokit.paginate(
          `GET /search/repositories`,
          // The continuation below relies on the last result being the
          // least-starred one, so the ordering has to be requested explicitly
          // (the API defaults to "best match").
          { q, sort: `stars`, order: `desc`, per_page: 100 },
          (response) => {
            totalCount = response.data[`total_count`]; // can change on subsequent queries
            // @ts-ignore
            results.push(...response.data.map(pickRepo));
            const rateLimitLimit = response.headers[`x-ratelimit-limit`]; // 10 without token
            const rateLimitUsed = response.headers[`x-ratelimit-used`];
            rateLimitReset =
              parseRateLimitReset(response.headers) ?? rateLimitReset;
            const resetText = rateLimitReset
              ? rateLimitReset.toISOString()
              : `unknown`;
            log.debug(
              `Scraped: ${results.length} | Rate limit: ${rateLimitUsed}/${rateLimitLimit} (resets ${resetText})`
            );
          }
        );
      } catch (err) {
        if (!err?.message?.includes(`rate limit exceeded`)) throw err;
        rateLimited = true;
        // Prefer the reset time reported with the failing response, if any.
        rateLimitReset =
          parseRateLimitReset(err.response?.headers) ?? rateLimitReset;
        const now = Date.now();
        // A missing or already-passed reset time would mean no waiting at all,
        // so fall back to a fixed pause in that case.
        const resetPlusSlightDelay =
          rateLimitReset && rateLimitReset.getTime() + 5000 > now
            ? new Date(rateLimitReset.getTime() + 5000)
            : new Date(now + 65 * 1000); // after 65 seconds
        log.warning(
          `Rate limit exceeded, will retry at ${resetPlusSlightDelay.toISOString()}`
        );
        await sleepUntil(resetPlusSlightDelay);
      }

      if (rateLimited && results.length === 0) {
        // Nothing was scraped before the limit hit, so there is no star count
        // to continue from: enqueue the very same search again under a new key.
        const retryCount = (request.userData.retryCount ?? 0) + 1;
        await requestQueue.addRequest({
          url: request.url, // just pass dummy.com
          userData: { ...request.userData, retryCount },
          uniqueKey: `${request.uniqueKey}|retry-${retryCount}`,
        });
        return;
      }

      const lastResult = results[results.length - 1];
      if (lastResult && totalCount > results.length) {
        // more results, we have to search for them by limiting search by stars
        log.info(
          `Total count ${totalCount} is higher than current count ${results.length}, continuing...`
        );
        const lastResultStars = lastResult.stars + 1; // +1 because we are filtering `less than stars`, not `less than and equal`
        const nextFilter = Math.min(
          (filter || Infinity) - 1, // either lower by one, so the filter always shrinks
          lastResultStars // or continue right below the last seen star count
        );
        if (nextFilter > 0) {
          await requestQueue.addRequest({
            url: request.url, // just pass dummy.com
            userData: { query, filter: nextFilter },
            uniqueKey: `${query}|${nextFilter}`,
          });
        }
      }
      await Apify.pushData(results);
    },
  });

  await crawler.run();
  log.info(`That's all folks!`);
});

/**
 * Reads the `x-ratelimit-reset` header (epoch seconds) from response headers.
 *
 * @param {Record<string, string | undefined> | undefined} headers
 * @returns {Date | undefined} Reset time, or `undefined` when the header is missing or not numeric.
 */
function parseRateLimitReset(headers) {
  const epochSeconds = Number.parseInt(headers?.[`x-ratelimit-reset`], 10);
  return Number.isFinite(epochSeconds)
    ? new Date(epochSeconds * 1000)
    : undefined;
}
124
/**
 * Maps a repository object from the GitHub search response to the flat
 * record stored in the dataset.
 *
 * @param {object} repo Repository item as returned by `GET /search/repositories`.
 * @returns {object} Record with the selected repository fields; `owner` is the
 *   owner's login, or `null` when the API returned no owner.
 */
function pickRepo(repo) {
  return {
    // `owner` is nullable in the search response, so do not dereference it blindly.
    owner: repo.owner?.login ?? null,
    name: repo.name,
    url: repo.html_url,
    fork: repo.fork,
    description: repo.description,
    created_at: repo.created_at,
    updated_at: repo.updated_at,
    pushed_at: repo.pushed_at,
    homepage: repo.homepage,
    size: repo.size,
    stars: repo.stargazers_count,
    open_issues: repo.open_issues_count,
    forks: repo.forks_count,
    language: repo.language,
    archived: repo.archived,
    disabled: repo.disabled,
  };
}
package.json
1{
2 "name": "github-repos-search-scraper",
3 "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
4 "type": "module",
5 "scripts": {
6 "start": "node ./main.js",
7 "push-to-apify-platform": "npx apify push"
8 },
9 "dependencies": {
10 "@octokit/rest": "*",
11 "apify": "*"
12 },
13 "apify": {
14 "title": "GitHub repos search scraper",
15 "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
16 "isPublic": true,
17 "isDeprecated": false,
18 "isAnonymouslyRunnable": true,
19 "notice": "",
20 "pictureUrl": "",
21 "seoTitle": "",
22 "seoDescription": "",
23 "categories": [
24 "AUTOMATION"
25 ]
26 }
27}
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "github-repos-search-scraper",
4 "title": "GitHub repos search scraper",
5 "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
6 "version": "0.1.0",
7 "storages": {
8 "dataset": {
9 "actorSpecification": 1,
10 "title": "GitHub repos search scraper",
11 "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
12 "views": {
13 "overview": {
14 "title": "Overview",
15 "description": "Overview of the most important fields",
16 "transformation": {
17 "fields": [
18 "owner",
19 "name",
20 "url",
21 "fork",
22 "description",
23 "created_at",
24 "updated_at",
25 "pushed_at",
26 "homepage",
27 "size",
28 "stars",
29 "open_issues",
30 "forks",
31 "language",
32 "archived",
33 "disabled"
34 ]
35 },
36 "display": {
37 "component": "table",
38 "columns": [
39 {
40 "label": "Owner",
41 "field": "owner",
42 "format": "text"
43 },
44 {
45 "label": "Name",
46 "field": "name",
47 "format": "text"
48 },
49 {
50 "label": "Url",
51 "field": "url",
52 "format": "link"
53 },
54 {
55 "label": "Fork",
56 "field": "fork",
57 "format": "boolean"
58 },
59 {
60 "label": "Description",
61 "field": "description",
62 "format": "text"
63 },
64 {
65 "label": "Created_at",
66 "field": "created_at",
67 "format": "text"
68 },
69 {
70 "label": "Updated_at",
71 "field": "updated_at",
72 "format": "text"
73 },
74 {
75 "label": "Pushed_at",
76 "field": "pushed_at",
77 "format": "text"
78 },
79 {
80 "label": "Homepage",
81 "field": "homepage",
82 "format": "text"
83 },
84 {
85 "label": "Size",
86 "field": "size",
87 "format": "number"
88 },
89 {
90 "label": "Stars",
91 "field": "stars",
92 "format": "number"
93 },
94 {
95 "label": "Open_issues",
96 "field": "open_issues",
97 "format": "number"
98 },
99 {
100 "label": "Forks",
101 "field": "forks",
102 "format": "number"
103 },
104 {
105 "label": "Language",
106 "field": "language",
107 "format": "text"
108 },
109 {
110 "label": "Archived",
111 "field": "archived",
112 "format": "boolean"
113 },
114 {
115 "label": "Disabled",
116 "field": "disabled",
117 "format": "boolean"
118 }
119 ]
120 }
121 }
122 }
123 }
124 }
125}
_utils/common.js
1import { createHash } from 'crypto'
2
// inspired by @drobnikj
/**
 * Builds a unique key for a URL: the hex SHA-256 digest of the URL with its
 * leading protocol (`scheme://`) removed, so that e.g. the `http` and `https`
 * variants of one address share a key.
 *
 * @param {string} url
 * @return {string} 64-character lowercase hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  // Strip only a leading scheme. Splitting on every `://` would cut the URL
  // at a second occurrence (e.g. a URL passed in the query string) and make
  // different URLs collide; it would also fail on protocol-less input.
  const cleanUrl = url.replace(/^[a-z][a-z\d+.-]*:\/\//i, ``)
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
10
// Largest delay setTimeout accepts; larger values are replaced by a 1 ms delay.
const MAX_TIMEOUT_MS = 2 ** 31 - 1

/**
 * Resolves once the given point in time has been reached. Resolves right away
 * when the target is in the past or is not a valid date.
 *
 * The wait is done in slices of at most MAX_TIMEOUT_MS, so far-away targets do
 * not overflow the timer, and the remaining time is re-checked after every
 * slice, so the promise never resolves before the target.
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = async (datetime) => {
  let remaining = datetime - Date.now()
  // NaN (invalid or missing date) fails this comparison, so it means "no wait".
  while (remaining > 0) {
    const slice = Math.min(remaining, MAX_TIMEOUT_MS)
    await new Promise((resolve) => {
      setTimeout(resolve, slice)
    })
    remaining = datetime - Date.now()
  }
}
26
/**
 * Extracts the whole-unit amount from a price string such as `$1,234.56`
 * or `1.234,56 €`.
 *
 * A trailing separator followed by exactly two digits is treated as the
 * decimal part and dropped; every other `.` or `,` is treated as a grouping
 * separator.
 *
 * @param {string} string Text containing a price.
 * @return {{ amount: number, currency: undefined }} `amount` is the integer
 *   part of the price (NaN when the text contains no digits). `currency` is
 *   not detected and is always `undefined`.
 */
export function parsePrice (string) {
  const currency = undefined
  const noText = string.replace(/[^0-9.,]/g, ``)
  const decimals = noText.match(/([.,])(\d{2})$/)
  // Cut at the position of the decimal separator rather than splitting on its
  // character, which may also be used for grouping (e.g. `1.234.567,89`).
  const integerPart = decimals ? noText.slice(0, decimals.index) : noText
  const digits = integerPart.replace(/[.,]/g, ``)
  // `.50` has a decimal part but no integer digits: that is zero whole units.
  const amount = decimals && digits === `` ? 0 : Number.parseInt(digits, 10)
  return { amount, currency }
}
Developer
Maintained by Community
Categories