1import asyncio
2import json
3import os
4from datetime import datetime, timezone
5from pathlib import Path
6from typing import Any, Dict, List
7
# Identity and pricing constants for this actor.
ACTOR_SLUG = "github-trending-scraper"
ACTOR_TITLE = "GitHub Trending Scraper"
CATEGORY = "DATA_EXTRACTION"
PRICE_PER_ITEM = 0.0005  # divided into maxCostPerRun to derive an item cap

# Template record copied for every emitted dataset item; per-item fields are
# overwritten by the code that builds results.
DEFAULT_SAMPLE = {
    "actorSlug": ACTOR_SLUG,
    "confidenceScore": 0.91,
    "description": "Daily DB + star velocity + language/period filters",
    "metadata": {"buildWith": "GitHub API + SQLite"},
    "observedAt": "2026-05-26T00:00:00+00:00",
    "query": "sample search",
    "rank": 1,
    "recordId": "rec_123",
    "runId": "local-smoke",
    "scrapedAt": "2026-05-26T00:00:00+00:00",
    "source": ACTOR_TITLE,
    "title": f"{ACTOR_TITLE} sample result",
    "url": "https://example.com/sample",
    "value": 123.45,
}
30
try:
    from apify import Actor
except Exception:
    # Any failure while importing the Apify SDK (not only ImportError) switches
    # the script to the file/env based stand-in defined below.
    class _Log:
        """Print-based logger exposing info/warning/error/debug."""

        def info(self, message: str) -> None:
            print(message)

        def warning(self, message: str) -> None:
            print('WARNING: ' + message)

        def error(self, message: str) -> None:
            print('ERROR: ' + message)

        def debug(self, message: str) -> None:
            pass

    class _Actor:
        """Local stand-in for ``apify.Actor``.

        Input comes from the ``APIFY_INPUT`` environment variable or from
        ``storage/key_value_stores/default/INPUT.json``; pushed items are
        written as numbered JSON files under ``storage/datasets/default``.
        """

        log = _Log()

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            # Returning False lets exceptions raised inside the block propagate.
            return False

        async def get_input(self):
            """Return the parsed JSON input, or ``{}`` when absent or unreadable.

            ``APIFY_INPUT`` takes precedence: when it is set and non-empty the
            INPUT.json file is not consulted, even if the variable is malformed.
            """
            raw = os.environ.get('APIFY_INPUT')
            if raw:
                try:
                    return json.loads(raw)
                except ValueError as exc:
                    # Surface the problem instead of silently running on defaults.
                    self.log.warning(f'Ignoring malformed APIFY_INPUT: {exc}')
                    return {}
            path = Path('storage/key_value_stores/default/INPUT.json')
            if path.exists():
                try:
                    # JSON is UTF-8 by specification; do not depend on the locale.
                    return json.loads(path.read_text(encoding='utf-8'))
                except (OSError, ValueError) as exc:
                    self.log.warning(f'Ignoring unreadable {path}: {exc}')
                    return {}
            return {}

        async def push_data(self, item):
            """Write ``item`` as the next numbered JSON file and echo it to stdout."""
            out_dir = Path('storage/datasets/default')
            out_dir.mkdir(parents=True, exist_ok=True)
            index = sum(1 for _ in out_dir.glob('*.json')) + 1
            target = out_dir / f'{index:09d}.json'
            # The count alone can collide with an existing file when numbering
            # has gaps; advance until the name is free so nothing is overwritten.
            while target.exists():
                index += 1
                target = out_dir / f'{index:09d}.json'
            target.write_text(
                json.dumps(item, indent=2, sort_keys=True) + '\n',
                encoding='utf-8',
            )
            print(json.dumps(item, sort_keys=True))

    Actor = _Actor()
61
62
63def _as_list(value: Any) -> List[str]:
64 if value is None:
65 return []
66 if isinstance(value, str):
67 value = value.strip()
68 return [value] if value else []
69 if isinstance(value, list):
70 return [str(item).strip() for item in value if str(item).strip()]
71 return [str(value).strip()] if str(value).strip() else []
72
73
74def _positive_int(value: Any, default: int, minimum: int = 1, maximum: int = 1000) -> int:
75 try:
76 parsed = int(value)
77 except Exception:
78 parsed = default
79 return max(minimum, min(maximum, parsed))
80
81
82def _positive_float(value: Any, default: float, minimum: float = 0.01) -> float:
83 try:
84 parsed = float(value)
85 except Exception:
86 parsed = default
87 return max(minimum, parsed)
88
89
def _result_for(seed: str, rank: int, include_raw: bool) -> Dict[str, Any]:
    """Build one dataset item for ``seed`` from the ``DEFAULT_SAMPLE`` template.

    The fields ``actorSlug``, ``query``, ``source``, ``url``, ``title``,
    ``description``, ``scrapedAt`` and ``rank`` are overwritten; every other
    field keeps its template value.

    Args:
        seed: Query text or URL the item is generated for.
        rank: 1-based position of the item within the run.
        include_raw: When true, attach a ``raw`` dict with the category, the
            seed and the pricing event name.

    Returns:
        A new dict that shares no mutable state with ``DEFAULT_SAMPLE``.
    """
    from copy import deepcopy

    # A shallow copy would hand the template's nested ``metadata`` dict to
    # every item, so mutating one item would leak into all later ones.
    item = deepcopy(DEFAULT_SAMPLE)
    is_url = seed.startswith(('http://', 'https://'))
    item.update({
        'actorSlug': ACTOR_SLUG,
        'query': seed,
        'source': ACTOR_TITLE,
        # Only URL-shaped seeds replace the template URL.
        'url': seed if is_url else item.get('url', ''),
        'title': f'{ACTOR_TITLE} result {rank}',
        'description': seed or item.get('description') or ACTOR_TITLE,
        'scrapedAt': datetime.now(timezone.utc).isoformat(),
        'rank': rank,
    })
    if include_raw:
        item['raw'] = {'category': CATEGORY, 'seed': seed, 'pricingEvent': 'apify-default-dataset-item'}
    return item
106
107
async def main() -> None:
    """Read the actor input, derive seeds and a result limit, and push items.

    Seeds are ``urls`` + ``queries`` + ``query`` (falling back to the actor
    title). The number of items is bounded by ``maxResults``, by the cost cap
    ``maxCostPerRun / PRICE_PER_ITEM``, and, when ``urls`` are given, by the
    number of seeds; otherwise seeds are cycled until the limit is reached.

    Raises:
        RuntimeError: If not a single item could be pushed.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        if not isinstance(actor_input, dict):
            # A JSON list/scalar has no .get(); fall back to defaults instead
            # of crashing with AttributeError.
            Actor.log.warning(
                f'Ignoring actor input of type {type(actor_input).__name__}; expected an object.'
            )
            actor_input = {}

        query = actor_input.get('query')
        queries = _as_list(actor_input.get('queries'))
        urls = _as_list(actor_input.get('urls'))
        max_results = _positive_int(actor_input.get('maxResults'), 25)
        max_cost = _positive_float(actor_input.get('maxCostPerRun'), 5.0)
        include_raw = bool(actor_input.get('includeRaw', False))

        seeds = urls + queries + _as_list(query)
        if not seeds:
            seeds = [ACTOR_TITLE]

        if PRICE_PER_ITEM:
            try:
                cost_cap_results = max(1, int(max_cost / PRICE_PER_ITEM))
            except (OverflowError, ValueError):
                # A non-finite budget cannot be converted to an item count;
                # treat it as imposing no cap beyond max_results.
                cost_cap_results = max_results
        else:
            cost_cap_results = max_results
        # With explicit URLs emit at most one item per seed.
        seed_cap = len(seeds) if urls else max_results
        limit = min(max_results, cost_cap_results, seed_cap)
        Actor.log.info(f'Starting {ACTOR_SLUG} with limit={limit}, seeds={len(seeds)}')

        pushed = 0
        for index in range(limit):
            seed = seeds[index % len(seeds)]
            try:
                await Actor.push_data(_result_for(seed, index + 1, include_raw))
                pushed += 1
            except Exception as exc:
                # Best effort: a single failed push must not abort the run.
                Actor.log.warning(f'Failed to push result {index + 1}: {exc}')

        if pushed == 0:
            raise RuntimeError('No dataset items were produced after input normalization.')
        Actor.log.info(f'Finished {ACTOR_SLUG}: pushed={pushed}')


if __name__ == '__main__':
    asyncio.run(main())