"""GitHub Repository Search & Scraper — Apify Actor.

Uses the official GitHub REST API v3 search endpoint:
    https://api.github.com/search/repositories

Field shape verified against a live API response. Relevant per-item fields:
    full_name, name, description, html_url, stargazers_count, forks_count,
    open_issues_count, watchers_count, language, topics, license{spdx_id,name},
    owner{login,type,html_url}, created_at, updated_at, pushed_at, homepage,
    archived, fork, default_branch, size

Auth: optional GitHub token. Unauthenticated search is limited to ~10 req/min;
authenticated is ~30 req/min and 5000 req/hr core. The user supplies their own
token as a secret input — we never store or transmit it anywhere but GitHub.
"""

from __future__ import annotations

import asyncio
import time
from urllib.parse import urlencode

import httpx
from apify import Actor
24
# Repository search endpoint of the GitHub REST API.
SEARCH_URL = "https://api.github.com/search/repositories"
# Upper bound applied to the requested `maxItems` before any request is made.
GITHUB_MAX_RESULTS = 1000
# Largest `per_page` value requested from the search endpoint.
PAGE_SIZE = 100
28
29
def _search_qualifier(name: str, value: str) -> str:
    """Format one ``name:value`` qualifier, quoting values that contain whitespace.

    Without the quotes GitHub would split a value such as ``Jupyter Notebook``
    and treat the second word as a free-text keyword.
    """
    if any(ch.isspace() for ch in value):
        # Strip any quotes the caller already supplied so we never double-quote.
        value = '"' + value.strip('"') + '"'
    return f"{name}:{value}"


def build_search_q(actor_input: dict) -> str:
    """Assemble the GitHub search `q` string from structured inputs.

    GitHub uses qualifiers like `language:python stars:>100 topic:cli`.

    Args:
        actor_input: Actor input mapping. Recognised keys are ``query``,
            ``language``, ``topic`` (comma-separated string or a list of
            strings), ``minStars``, ``user``, ``pushedAfter``,
            ``createdAfter``, ``excludeForks`` and ``excludeArchived``.

    Returns:
        The space-joined query string, or ``""`` when no criteria were given.
    """

    def text(key: str) -> str:
        # Missing keys, None and "" all collapse to an empty string.
        return str(actor_input.get(key) or "").strip()

    parts = []

    query = text("query")
    if query:
        parts.append(query)

    language = text("language")
    if language:
        parts.append(_search_qualifier("language", language))

    # Topics may arrive as "a, b" or as ["a", "b"]; each becomes its own qualifier.
    raw_topics = actor_input.get("topic") or ""
    if isinstance(raw_topics, str):
        raw_topics = raw_topics.split(",")
    for raw_topic in raw_topics:
        topic = str(raw_topic).strip()
        if topic:
            parts.append(_search_qualifier("topic", topic))

    min_stars = actor_input.get("minStars")
    # bool is a subclass of int; a stray True/False must not become "stars:>=True".
    if isinstance(min_stars, int) and not isinstance(min_stars, bool):
        parts.append(f"stars:>={min_stars}")

    user = text("user")
    if user:
        parts.append(f"user:{user}")

    pushed_after = text("pushedAfter")
    if pushed_after:
        parts.append(f"pushed:>={pushed_after}")

    created_after = text("createdAfter")
    if created_after:
        parts.append(f"created:>={created_after}")

    if actor_input.get("excludeForks"):
        parts.append("fork:false")
    if actor_input.get("excludeArchived"):
        parts.append("archived:false")

    return " ".join(parts)
73
74
def transform_repo(repo: dict) -> dict:
    """Convert a raw GitHub repo object into a clean, stable output record.

    Args:
        repo: One repository object as returned by the search endpoint.

    Returns:
        A flat dict with camelCase keys. Fields absent from ``repo`` come back
        as ``None`` (``topics`` as an empty list). The nested ``owner`` and
        ``license`` objects are flattened into top-level keys.
    """
    # Both nested objects may be missing or null (e.g. a repo with no license).
    owner = repo.get("owner") or {}
    license_info = repo.get("license") or {}
    return {
        "fullName": repo.get("full_name"),
        "name": repo.get("name"),
        "description": repo.get("description"),
        "url": repo.get("html_url"),
        "homepage": repo.get("homepage"),
        "owner": owner.get("login"),
        "ownerType": owner.get("type"),
        "ownerUrl": owner.get("html_url"),
        "stars": repo.get("stargazers_count"),
        "forks": repo.get("forks_count"),
        "watchers": repo.get("watchers_count"),
        "openIssues": repo.get("open_issues_count"),
        "language": repo.get("language"),
        "topics": repo.get("topics") or [],
        "license": license_info.get("spdx_id"),
        "licenseName": license_info.get("name"),
        "isFork": repo.get("fork"),
        "isArchived": repo.get("archived"),
        "defaultBranch": repo.get("default_branch"),
        "sizeKb": repo.get("size"),
        "createdAt": repo.get("created_at"),
        "updatedAt": repo.get("updated_at"),
        "pushedAt": repo.get("pushed_at"),
    }
104
105
# Number of tries per page before giving up on it.
_MAX_ATTEMPTS = 3
# Never sleep longer than this on a single rate-limit back-off.
_MAX_BACKOFF_SECONDS = 120


def _is_rate_limited(resp) -> bool:
    """Return True when *resp* looks like a GitHub rate-limit rejection."""
    if resp.status_code == 429:
        return True
    if resp.status_code != 403:
        return False
    return (
        "rate limit" in resp.text.lower()
        or resp.headers.get("x-ratelimit-remaining") == "0"
        or resp.headers.get("retry-after") is not None
    )


def _rate_limit_wait(resp, attempt: int) -> int:
    """Return the number of seconds to wait before retrying a rate-limited call.

    Prefers the server's `Retry-After` hint, then the `X-RateLimit-Reset`
    epoch timestamp, and finally falls back to a linear back-off.
    """
    retry_after = (resp.headers.get("retry-after") or "").strip()
    if retry_after.isdigit():
        return min(int(retry_after), _MAX_BACKOFF_SECONDS)

    reset_at = (resp.headers.get("x-ratelimit-reset") or "").strip()
    if resp.headers.get("x-ratelimit-remaining") == "0" and reset_at.isdigit():
        # +1s of slack so we do not retry just before the window rolls over.
        remaining = max(int(reset_at) - int(time.time()), 0) + 1
        return min(remaining, _MAX_BACKOFF_SECONDS)

    return attempt * 10


async def main() -> None:
    """Run the actor: read input, query GitHub page by page, push results.

    Reads `sort`, `order`, `maxItems` and `githubToken` from the actor input
    (plus the search criteria consumed by `build_search_q`), fetches search
    result pages until `maxItems` (capped at `GITHUB_MAX_RESULTS`) records have
    been pushed or results run out, and pushes each page's transformed
    records to the default dataset.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        sort = actor_input.get("sort", "stars")
        order = actor_input.get("order", "desc")
        # An explicit null in the input must fall back to the default, not crash int().
        raw_max_items = actor_input.get("maxItems")
        max_items = 100 if raw_max_items is None else int(raw_max_items)
        token = (actor_input.get("githubToken") or "").strip()

        q = build_search_q(actor_input)
        if not q:
            Actor.log.warning(
                "No search criteria provided. Add a query, language, topic, user, or filter."
            )
            await Actor.push_data([])
            return

        headers = {
            "Accept": "application/vnd.github+json",
            "User-Agent": "scrapeworks-github-search/0.1",
            "X-GitHub-Api-Version": "2022-11-28",
        }
        if token:
            headers["Authorization"] = f"Bearer {token}"
            Actor.log.info("Using authenticated GitHub requests (higher rate limit).")
        else:
            Actor.log.info(
                "Running unauthenticated (low rate limit ~10 req/min). "
                "Add a free GitHub token in the input for 5000 req/hr."
            )

        target = min(max_items, GITHUB_MAX_RESULTS)
        Actor.log.info(f"GitHub search q={q!r} sort={sort} order={order} maxItems={target}")

        # per_page must stay constant across pages: GitHub computes the offset
        # as (page - 1) * per_page, so shrinking it on the last page would
        # re-fetch earlier results and skip the ones actually wanted.
        per_page = min(PAGE_SIZE, target)

        base_params = {"q": q}
        # "best-match" is GitHub's default ordering, selected by omitting sort/order.
        if sort != "best-match":
            base_params["sort"] = sort
            base_params["order"] = order
        base_params["per_page"] = per_page

        pushed = 0
        page = 1
        total_count: int | None = None

        async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
            while pushed < target:
                url = f"{SEARCH_URL}?{urlencode({**base_params, 'page': page})}"

                data = None
                for attempt in range(1, _MAX_ATTEMPTS + 1):
                    try:
                        resp = await client.get(url)
                        if _is_rate_limited(resp):
                            if attempt == _MAX_ATTEMPTS:
                                # No retry follows, so sleeping would only delay the exit.
                                Actor.log.warning(
                                    "Rate limited on the final attempt; giving up on this page."
                                )
                                break
                            wait = _rate_limit_wait(resp, attempt)
                            Actor.log.warning(
                                f"Rate limited. Backing off {wait}s "
                                f"(add a token to avoid this)..."
                            )
                            await asyncio.sleep(wait)
                            continue
                        if resp.status_code == 422:
                            # Malformed query: retrying cannot help.
                            Actor.log.error(
                                f"GitHub rejected the query (422). Check your filters: {resp.text[:300]}"
                            )
                            return
                        resp.raise_for_status()
                        data = resp.json()
                        break
                    except (httpx.HTTPError, ValueError) as exc:
                        Actor.log.warning(f"Request attempt {attempt} failed: {exc}")
                        if attempt < _MAX_ATTEMPTS:
                            await asyncio.sleep(attempt * 3)

                if data is None:
                    Actor.log.error(f"Failed to fetch page {page}; stopping.")
                    break

                if total_count is None:
                    total_count = data.get("total_count", 0)
                    Actor.log.info(f"GitHub reports {total_count} matching repositories.")

                items = data.get("items", [])
                if not items:
                    Actor.log.info("No more results.")
                    break

                # Trim the final page so we never push more than `target` records.
                batch = [transform_repo(repo) for repo in items[: target - pushed]]
                pushed += len(batch)

                if batch:
                    await Actor.push_data(batch)
                    Actor.log.info(f"Pushed {pushed}/{target} repositories (page {page}).")

                page += 1

                if total_count is not None and pushed >= total_count:
                    break

        Actor.log.info(f"Done. Returned {pushed} repositories.")