
GitHub repos search scraper (deprecated)

Given a search query (e.g. "Apify"), scrapes all repos from GitHub whose title or description contains that query. Unlike the official search API, it is not limited to the first 1,000 results: when a query matches more repos, the actor repeatedly narrows the search by star count and keeps paginating.

Rating: 0.0 (0) · Pricing: pay per usage · Total users: 0 · Monthly users: 1 · Last modified: 3 years ago
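In practice the actor issues a sequence of progressively narrower searches. With a hypothetical query and made-up star counts, the progression looks like this:

apify              → up to 1,000 results; say the least-starred repo returned has 73 stars
apify stars:<74    → up to 1,000 more results below that threshold
apify stars:<12    → ...and so on, until the filter would drop to 0

The star counts above are illustrative only; the actual thresholds are derived from the scraped results (see the `filter` logic in main.js below).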
Dockerfile
FROM apify/actor-node:16
COPY package.json ./
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional
COPY . ./
INPUT_SCHEMA.json
{ "title": "GitHub repos search scraper", "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.", "type": "object", "schemaVersion": 1, "properties": { "queries": { "title": "Queries", "description": "", "type": "array", "editor": "stringList" }, "token": { "title": "Token", "description": "", "type": "string", "editor": "textfield" }, "debug": { "title": "Debug", "description": "Debug mode prints more logs, disables concurrency and other optimizations.", "type": "boolean", "editor": "checkbox", "default": false } }, "required": [ "queries" ]}
apify.json
{ "name": "github-repos-search-scraper", "version": "0.1", "buildTag": "latest", "env": null, "defaultRunOptions": { "build": "latest", "timeoutSecs": 3600, "memoryMbytes": 1024 }}
main.js
/**
 * TODOs
 * ===
 * - Better TS
 * - Verbose-r logs in debug mode
 * - Output schema
 */

import { Octokit } from "@octokit/rest";
import Apify from "apify";
import { sleepUntil } from "./_utils/common.js";
const { log } = Apify.utils;

Apify.main(async () => {
  const input = await Apify.getInput();
  const {
    queries = [`meteor`],
    token, // eg: abc123
    debug = false,
  } = input ?? {};
  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  if (!token)
    log.warning(
      `No token provided, will use anonymous access, which is severely limited and may cause rate limit issues.`
    );

  // Prepare request queue – one request per query
  const requestQueue = await Apify.openRequestQueue();
  for (const query of queries) {
    await requestQueue.addRequest({
      url: `https://dummy.com`, // compat with requestQueue, which requires a URL
      uniqueKey: query,
      userData: {
        query,
        // `filter` prop added for paginated requests
      },
    });
  }

  // Init GitHub API
  const octokit = new Octokit({ auth: token });

  let rateLimitReset; // TODO: This should probably be saved on the session?

  const crawler = new Apify.BasicCrawler({
    handleRequestTimeoutSecs: 60 * 2,
    maxRequestRetries: 0,
    requestQueue,
    handleRequestFunction: async (context) => {
      const { request } = context;
      const results = [];
      const { filter, query } = request.userData;
      log.info(
        `Processing query "${query}", specifying filter "${filter || ``}"`
      );

      let totalCount;
      const q = filter ? `${query} stars:<${filter}` : query;

      try {
        // @ts-ignore
        await octokit.paginate(
          `GET /search/repositories`,
          // Sorting by stars (descending) is required for the star-based
          // continuation below – the last result must be the least-starred one.
          { q, sort: `stars`, order: `desc`, per_page: 100 },
          (response) => {
            totalCount = response.data[`total_count`]; // can change on subsequent queries
            // @ts-ignore
            results.push(...response.data.map(pickRepo));
            const rateLimitLimit = response.headers[`x-ratelimit-limit`]; // 10 without token
            const rateLimitUsed = response.headers[`x-ratelimit-used`];
            rateLimitReset = new Date(
              parseInt(response.headers[`x-ratelimit-reset`]) * 1000
            );
            log.debug(
              `Scraped: ${results.length} | Rate limit: ${rateLimitUsed}/${rateLimitLimit} (resets ${rateLimitReset.toISOString()})`
            );
          }
        );
      } catch (err) {
        if (err.message.includes(`rate limit exceeded`)) {
          const resetPlusSlightDelay = rateLimitReset
            ? new Date(rateLimitReset.getTime() + 5000)
            : new Date(Date.now() + 65 * 1000); // after 65 seconds
          log.warning(
            `Rate limit exceeded, will retry at ${resetPlusSlightDelay.toISOString()}`
          );
          await sleepUntil(resetPlusSlightDelay);
        } else {
          throw err;
        }
      }

      if (totalCount > results.length) {
        // More results than the API will return – narrow the search by star count
        log.info(
          `Total count ${totalCount} is higher than current count ${results.length}, continuing...`
        );
        const lastResult = results[results.length - 1];
        const lastResultStars = lastResult.stars + 1; // +1 because we are filtering `less than stars`, not `less than or equal`
        const nextFilter = Math.min(
          (filter || Infinity) - 1, // either lower the current filter by one...
          lastResultStars // ...or continue below the last scraped repo's star count
        );
        if (nextFilter > 0) {
          await requestQueue.addRequest({
            url: request.url, // just pass dummy.com along
            userData: { query, filter: nextFilter },
            uniqueKey: `${query}|${nextFilter}`,
          });
        }
      }
      await Apify.pushData(results);
    },
  });

  await crawler.run();
  log.info(`That's all folks!`);
});

function pickRepo(repo) {
  return {
    owner: repo.owner.login,
    name: repo.name,
    url: repo.html_url,
    fork: repo.fork,
    description: repo.description,
    created_at: repo.created_at,
    updated_at: repo.updated_at,
    pushed_at: repo.pushed_at,
    homepage: repo.homepage,
    size: repo.size,
    stars: repo.stargazers_count,
    open_issues: repo.open_issues_count,
    forks: repo.forks_count,
    language: repo.language,
    archived: repo.archived,
    disabled: repo.disabled,
  };
}
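Each dataset item has the shape produced by pickRepo above. An illustrative record (all values made up) looks like this:

{
  "owner": "apify",
  "name": "example-repo",
  "url": "https://github.com/apify/example-repo",
  "fork": false,
  "description": "An example repository",
  "created_at": "2019-01-01T00:00:00Z",
  "updated_at": "2021-06-01T00:00:00Z",
  "pushed_at": "2021-06-01T00:00:00Z",
  "homepage": null,
  "size": 1024,
  "stars": 42,
  "open_issues": 3,
  "forks": 7,
  "language": "JavaScript",
  "archived": false,
  "disabled": false
}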
package.json
{ "name": "github-repos-search-scraper", "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.", "type": "module", "scripts": { "start": "node ./main.js", "push-to-apify-platform": "npx apify push" }, "dependencies": { "@octokit/rest": "*", "apify": "*" }, "apify": { "title": "GitHub repos search scraper", "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.", "isPublic": true, "isDeprecated": false, "isAnonymouslyRunnable": true, "notice": "", "pictureUrl": "", "seoTitle": "", "seoDescription": "", "categories": [ "AUTOMATION" ] }}
.actor/actor.json
{ "actorSpecification": 1, "name": "github-repos-search-scraper", "title": "GitHub repos search scraper", "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.", "version": "0.1.0", "storages": { "dataset": { "actorSpecification": 1, "title": "GitHub repos search scraper", "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.", "views": { "overview": { "title": "Overview", "description": "Overview of the most important fields", "transformation": { "fields": [ "owner", "name", "url", "fork", "description", "created_at", "updated_at", "pushed_at", "homepage", "size", "stars", "open_issues", "forks", "language", "archived", "disabled" ] }, "display": { "component": "table", "columns": [ { "label": "Owner", "field": "owner", "format": "text" }, { "label": "Name", "field": "name", "format": "text" }, { "label": "Url", "field": "url", "format": "link" }, { "label": "Fork", "field": "fork", "format": "boolean" }, { "label": "Description", "field": "description", "format": "text" }, { "label": "Created_at", "field": "created_at", "format": "text" }, { "label": "Updated_at", "field": "updated_at", "format": "text" }, { "label": "Pushed_at", "field": "pushed_at", "format": "text" }, { "label": "Homepage", "field": "homepage", "format": "text" }, { "label": "Size", "field": "size", "format": "number" }, { "label": "Stars", "field": "stars", "format": "number" }, { "label": "Open_issues", "field": "open_issues", "format": "number" }, { "label": "Forks", "field": "forks", "format": "number" }, { "label": "Language", "field": "language", "format": "text" }, { "label": "Archived", "field": "archived", "format": "boolean" }, { "label": "Disabled", "field": "disabled", "format": "boolean" } ] } } } } }}
_utils/common.js
import { createHash } from 'crypto'

// inspired by @drobnikj
export const createUniqueKeyFromUrl = (url) => {
  const hash = createHash(`sha256`)
  const cleanUrl = url.split(`://`)[1] // Remove protocol
  hash.update(cleanUrl)
  return hash.digest(`hex`)
}
/**
 * Resolves once the given datetime has passed; resolves immediately if it is already in the past.
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = (datetime) => {
  const now = new Date()
  const difference = datetime - now
  if (difference > 0) {
    return new Promise((resolve) => {
      setTimeout(resolve, difference)
    })
  }
  return Promise.resolve()
}
// Extracts a numeric amount from a price string such as `$1,234.56`.
// NOTE: currency detection is not implemented, so `currency` is always undefined.
export function parsePrice (string) {
  let amount, currency
  const noText = string.replace(/[^0-9.,]/g, ``)
  const decimals = noText.match(/([.,])(\d{2})$/)
  if (decimals) {
    const decimalSeparator = decimals[1]
    const decimalAmount = decimals[2]
    // Strip thousands separators from the integer part, then re-attach the decimals
    const integerPart = noText.split(decimalSeparator)[0].replace(/[.,]/g, ``)
    amount = parseFloat(`${integerPart}.${decimalAmount}`)
  } else {
    const justNumbers = noText.replace(/[.,]/g, ``)
    amount = parseInt(justNumbers)
  }
  return { amount, currency }
}
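A couple of hypothetical calls showing the expected behaviour of parsePrice:

parsePrice(`$1,234.56`) // => { amount: 1234.56, currency: undefined }
parsePrice(`1500 Kč`)   // => { amount: 1500, currency: undefined }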