GitHub repos search scraper avatar
GitHub repos search scraper
Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
GitHub repos search scraper

GitHub repos search scraper

strajk/github-repos-search-scraper

Given a search query (e.g. "Apify"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.

Dockerfile

# Apify's Node.js 16 actor base image.
FROM apify/actor-node:16

# Copy only the manifest first so the dependency layer can be cached
# independently of source changes.
COPY package.json ./

# Install production dependencies only, without optional packages,
# keeping npm output quiet.
RUN npm --quiet set progress=false \
  && npm install --only=prod --no-optional

# Copy the rest of the actor sources.
COPY . ./

INPUT_SCHEMA.json

1{
2  "title": "GitHub repos search scraper",
3  "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
4  "type": "object",
5  "schemaVersion": 1,
6  "properties": {
7    "queries": {
8      "title": "Queries",
9      "description": "",
10      "type": "array",
11      "editor": "stringList"
12    },
13    "token": {
14      "title": "Token",
15      "description": "",
16      "type": "string",
17      "editor": "textfield"
18    },
19    "debug": {
20      "title": "Debug",
21      "description": "Debug mode prints more logs, disables concurrency and other optimizations.",
22      "type": "boolean",
23      "editor": "checkbox",
24      "default": false
25    }
26  },
27  "required": [
28    "queries"
29  ]
30}

apify.json

1{
2  "name": "github-repos-search-scraper",
3  "version": "0.1",
4  "buildTag": "latest",
5  "env": null,
6  "defaultRunOptions": {
7    "build": "latest",
8    "timeoutSecs": 3600,
9    "memoryMbytes": 1024
10  }
11}

main.js

1/**
2 * TODOs
3 * ===
4 * - Better TS
5 * - Verbose-r logs in debug mode
6 * - Output schema
7 *
8 * */
9
10import { Octokit } from "@octokit/rest";
11import Apify from "apify";
12import { sleepUntil } from "./_utils/common";
13const { log } = Apify.utils;
14
/**
 * Actor entry point.
 *
 * For every input query, searches GitHub repositories and stores the picked
 * fields in the default dataset. A single search is capped by GitHub, so when
 * `total_count` is higher than what was fetched, a follow-up request is
 * enqueued that narrows the search with `stars:<N`, where N is derived from
 * the least-starred repo seen so far. This only works when results are ordered
 * by stars, hence the explicit `sort`/`order` parameters.
 *
 * Known limitations:
 * - Repos whose star count equals the window boundary are fetched again by the
 *   follow-up request, so the dataset may contain duplicates.
 * - If more repos share one star count than a single search can return,
 *   the remainder of that star bucket cannot be reached.
 */
Apify.main(async () => {
  // How many times a request that got rate limited before yielding any result is re-enqueued
  const MAX_RATE_LIMIT_RETRIES = 5;

  const input = await Apify.getInput();
  const {
    queries = [`meteor`],
    token, // eg: abc123
    debug = false,
  } = input ?? {};
  if (debug) Apify.utils.log.setLevel(Apify.utils.log.LEVELS.DEBUG);

  if (!token)
    log.warning(
      `No token provided, will use anonymous access, which is severely limited and may cause rate limit issues.`
    );

  // Prepare request queue
  const requestQueue = await Apify.openRequestQueue();
  for (const query of queries) {
    const requestLike = {
      url: `https://dummy.com`, // compat with requestQueue
      uniqueKey: query,
      userData: {
        query,
        // `filter` prop added for paginated requests
      },
    };
    await requestQueue.addRequest(requestLike);
  }

  // Init GitHub API
  const octokit = new Octokit({ auth: token });

  let rateLimitReset; // TODO: This should be probably saved on session?

  const crawler = new Apify.BasicCrawler({
    handleRequestTimeoutSecs: 60 * 2,
    maxRequestRetries: 0,
    requestQueue,
    handleRequestFunction: async ({ request }) => {
      const results = [];
      const { filter, query, rateLimitRetries = 0 } = request.userData;
      log.info(
        `Processing query "${query}", specifying filter "${filter || ``}"`
      );

      let totalCount;
      let wasRateLimited = false;
      const q = filter ? `${query} stars:<${filter}` : query;

      try {
        // @ts-ignore
        await octokit.paginate(
          `GET /search/repositories`,
          // Order by stars so that the last fetched repo is the least-starred one,
          // which the `stars:<N` continuation below relies on
          { q, per_page: 100, sort: `stars`, order: `desc` },
          (response) => {
            totalCount = response.data[`total_count`]; // can change on subsequent queries
            // @ts-ignore
            results.push(...response.data.map(pickRepo));
            const rateLimitLimit = response.headers[`x-ratelimit-limit`]; // 10 without token
            const rateLimitUsed = response.headers[`x-ratelimit-used`];
            const resetSeconds = Number.parseInt(
              response.headers[`x-ratelimit-reset`],
              10
            );
            // Keep the previous value when the header is missing or malformed,
            // otherwise an Invalid Date would make `toISOString()` throw
            if (Number.isFinite(resetSeconds)) {
              rateLimitReset = new Date(resetSeconds * 1000);
            }
            log.debug(
              `Scraped: ${
                results.length
              } | Rate limit: ${rateLimitUsed}/${rateLimitLimit} (resets ${
                rateLimitReset ? rateLimitReset.toISOString() : `unknown`
              })`
            );
            // Results are collected in `results`; don't let paginate accumulate anything
            return [];
          }
        );
      } catch (err) {
        if (!/rate limit/i.test(String(err?.message))) throw err;
        wasRateLimited = true;
        const now = Date.now();
        // Trust the reset header only when it points to the future, a stale
        // value would result in no waiting at all
        const resetPlusSlightDelay =
          rateLimitReset && rateLimitReset.getTime() > now
            ? new Date(rateLimitReset.getTime() + 5000)
            : new Date(now + 65 * 1000); // after 65 seconds
        log.warning(
          `Rate limit exceeded, will retry at ${resetPlusSlightDelay.toISOString()}`
        );
        await sleepUntil(resetPlusSlightDelay);
      }

      if (wasRateLimited && results.length === 0) {
        // Nothing was fetched, so there is no star count to continue from:
        // re-enqueue the very same search under a fresh unique key
        if (rateLimitRetries >= MAX_RATE_LIMIT_RETRIES) {
          throw new Error(
            `Rate limit exceeded ${rateLimitRetries + 1} times in a row for query "${query}", giving up`
          );
        }
        await requestQueue.addRequest({
          url: request.url, // just pass dummy.com
          userData: { query, filter, rateLimitRetries: rateLimitRetries + 1 },
          uniqueKey: `${request.uniqueKey}|retry-${rateLimitRetries + 1}`,
        });
        return;
      }

      if (results.length > 0 && totalCount > results.length) {
        // more results, we have to search for them by limiting search by stars
        log.info(
          `Total count ${totalCount} is higher than current count ${results.length}, continuing...`
        );
        const lastResult = results[results.length - 1];
        const lastResultStars = lastResult.stars + 1; // +1 because we are filtering `less than stars`, not `less than and equal`
        const nextFilter = Math.min(
          (filter || Infinity) - 1, // either lower by one, to guarantee progress
          lastResultStars // or continue right below the least-starred repo fetched
        );
        if (nextFilter > 0) {
          await requestQueue.addRequest({
            url: request.url, // just pass dummy.com
            userData: { query, filter: nextFilter },
            uniqueKey: `${query}|${nextFilter}`,
          });
        }
      }
      await Apify.pushData(results);
    },
  });

  await crawler.run();
  log.info(`That's all folks!`);
});
124
/**
 * Reduces a GitHub repository object to the flat record stored in the dataset.
 *
 * @param {object} repo - Repository item as returned by the GitHub search API.
 * @returns {object} Record with the selected fields; `owner` is the owner's
 *   login, or `null` when the item carries no owner.
 */
function pickRepo(repo) {
  return {
    // GitHub's schema allows `owner` to be null; one such item must not
    // throw and fail the whole request
    owner: repo.owner?.login ?? null,
    name: repo.name,
    url: repo.html_url,
    fork: repo.fork,
    description: repo.description,
    created_at: repo.created_at,
    updated_at: repo.updated_at,
    pushed_at: repo.pushed_at,
    homepage: repo.homepage,
    size: repo.size,
    stars: repo.stargazers_count,
    open_issues: repo.open_issues_count,
    forks: repo.forks_count,
    language: repo.language,
    archived: repo.archived,
    disabled: repo.disabled,
  };
}

package.json

1{
2  "name": "github-repos-search-scraper",
3  "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
4  "type": "module",
5  "scripts": {
6    "start": "node ./main.js",
7    "push-to-apify-platform": "npx apify push"
8  },
9  "dependencies": {
10    "@octokit/rest": "*",
11    "apify": "*"
12  },
13  "apify": {
14    "title": "GitHub repos search scraper",
15    "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
16    "isPublic": true,
17    "isDeprecated": false,
18    "isAnonymouslyRunnable": true,
19    "notice": "",
20    "pictureUrl": "",
21    "seoTitle": "",
22    "seoDescription": "",
23    "categories": [
24      "AUTOMATION"
25    ]
26  }
27}

.actor/actor.json

1{
2  "actorSpecification": 1,
3  "name": "github-repos-search-scraper",
4  "title": "GitHub repos search scraper",
5  "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
6  "version": "0.1.0",
7  "storages": {
8    "dataset": {
9      "actorSpecification": 1,
10      "title": "GitHub repos search scraper",
11      "description": "Given a search query (e.g. \"Apify\"), scrapes all repos from GitHub containing that query in title or description. It's not limited to the first 1000 results as the official API is.",
12      "views": {
13        "overview": {
14          "title": "Overview",
15          "description": "Overview of the most important fields",
16          "transformation": {
17            "fields": [
18              "owner",
19              "name",
20              "url",
21              "fork",
22              "description",
23              "created_at",
24              "updated_at",
25              "pushed_at",
26              "homepage",
27              "size",
28              "stars",
29              "open_issues",
30              "forks",
31              "language",
32              "archived",
33              "disabled"
34            ]
35          },
36          "display": {
37            "component": "table",
38            "columns": [
39              {
40                "label": "Owner",
41                "field": "owner",
42                "format": "text"
43              },
44              {
45                "label": "Name",
46                "field": "name",
47                "format": "text"
48              },
49              {
50                "label": "Url",
51                "field": "url",
52                "format": "link"
53              },
54              {
55                "label": "Fork",
56                "field": "fork",
57                "format": "boolean"
58              },
59              {
60                "label": "Description",
61                "field": "description",
62                "format": "text"
63              },
64              {
65                "label": "Created_at",
66                "field": "created_at",
67                "format": "text"
68              },
69              {
70                "label": "Updated_at",
71                "field": "updated_at",
72                "format": "text"
73              },
74              {
75                "label": "Pushed_at",
76                "field": "pushed_at",
77                "format": "text"
78              },
79              {
80                "label": "Homepage",
81                "field": "homepage",
82                "format": "text"
83              },
84              {
85                "label": "Size",
86                "field": "size",
87                "format": "number"
88              },
89              {
90                "label": "Stars",
91                "field": "stars",
92                "format": "number"
93              },
94              {
95                "label": "Open_issues",
96                "field": "open_issues",
97                "format": "number"
98              },
99              {
100                "label": "Forks",
101                "field": "forks",
102                "format": "number"
103              },
104              {
105                "label": "Language",
106                "field": "language",
107                "format": "text"
108              },
109              {
110                "label": "Archived",
111                "field": "archived",
112                "format": "boolean"
113              },
114              {
115                "label": "Disabled",
116                "field": "disabled",
117                "format": "boolean"
118              }
119            ]
120          }
121        }
122      }
123    }
124  }
125}

_utils/common.js

1import { createHash } from 'crypto'
2
// inspired by @drobnikj
/**
 * Creates a protocol-independent unique key for a URL: the SHA-256 hex digest
 * of the URL with its leading scheme (`http://`, `https://`, ...) removed.
 *
 * @param {string} url
 * @return {string} 64-character hex digest
 */
export const createUniqueKeyFromUrl = (url) => {
  // Strip only the leading scheme. Splitting on every `://` would cut off the
  // rest of a URL that embeds another URL (e.g. in a query parameter) and make
  // different URLs collide; a URL without a scheme is hashed as is.
  const cleanUrl = url.replace(/^[a-z][a-z0-9+.-]*:\/\//i, ``)
  return createHash(`sha256`).update(cleanUrl).digest(`hex`)
}
10
/**
 * Resolves once the given point in time has been reached.
 * Resolves right away when the time is already in the past or when the
 * argument does not yield a valid time (e.g. `undefined`).
 *
 * @param {Date} datetime
 * @return {Promise<void>}
 */
export const sleepUntil = (datetime) => {
  // Longest delay setTimeout supports; anything above fires almost immediately
  const MAX_TIMEOUT_MS = 2 ** 31 - 1
  return new Promise((resolve) => {
    const waitForRemainder = () => {
      const remaining = datetime - Date.now()
      // `!(x > 0)` also covers NaN, i.e. a missing or invalid datetime
      if (!(remaining > 0)) {
        resolve()
        return
      }
      // Re-check the clock after every timer: covers waits longer than
      // MAX_TIMEOUT_MS as well as timers firing slightly ahead of time
      setTimeout(waitForRemainder, Math.min(remaining, MAX_TIMEOUT_MS))
    }
    waitForRemainder()
  })
}
26
/**
 * Extracts the whole-unit amount from a price string such as `$1,234.56`.
 *
 * A trailing separator followed by exactly two digits is treated as the
 * decimal part and dropped; every other `.`/`,` is treated as a thousands
 * separator.
 *
 * @param {string} string - Text containing a price.
 * @return {{ amount: number, currency: undefined }} `amount` is NaN when the
 *   text contains no digits; `currency` is not detected and stays undefined.
 */
export function parsePrice (string) {
  const noText = string.replace(/[^0-9.,]/g, ``)
  const decimals = noText.match(/([.,])(\d{2})$/)
  // Everything before the decimal separator, or the whole number when there is none
  const integerPart = decimals ? noText.slice(0, decimals.index) : noText
  // Drop thousands separators before parsing, parseInt would stop at the first one
  const digits = integerPart.replace(/[.,]/g, ``)
  // `.50` has an empty integer part, which means zero whole units
  const amount = decimals && digits === `` ? 0 : Number.parseInt(digits, 10)
  const currency = undefined // currency detection is not implemented
  return { amount, currency }
}
Developer
Maintained by Community
Categories