1"""Crates.io Search & Scraper — Apify Actor.
2
3Searches the crates.io API (Rust package registry) and returns clean,
4structured JSON per crate: downloads, recent downloads, versions, links, dates,
a computed "activity" signal (active / slowing / stale / dormant, or unknown
when the update date is missing) derived from the recency of updates, and the
share of recent vs. total downloads.
7
8Verified against the live crates.io API response shape.
9"""
10
11from __future__ import annotations
12
13import asyncio
14from datetime import datetime, timezone
15from urllib.parse import urlencode
16
17import httpx
18from apify import Actor
19
# Endpoint that every search request in this module is sent to.
API = "https://crates.io/api/v1/crates"

# Translates the actor's `sort` input option into the value sent as the
# `sort` query parameter; options not listed here fall back to "relevance"
# at the lookup site.
SORT_MAP = {
    "relevance": "relevance",
    "downloads": "downloads",
    "recent-downloads": "recent-downloads",
    "newest": "new",
    "recently-updated": "recent-updates",
}
24
25
26def _parse_dt(value: str | None) -> datetime | None:
27 if not value:
28 return None
29 try:
30 return datetime.fromisoformat(value.replace("Z", "+00:00"))
31 except (ValueError, AttributeError):
32 return None
33
34
def compute_activity(crate: dict, now: datetime) -> tuple[str, int | None]:
    """Classify a crate by how long ago it was last updated.

    Reads ``updated_at`` from *crate* and measures its age against *now* in
    whole days (truncated toward zero).

    Returns ``(activityLevel, daysSinceUpdate)``:
    ``"active"`` up to 90 days, ``"slowing"`` up to 365, ``"stale"`` up to 730,
    ``"dormant"`` beyond that, and ``("unknown", None)`` when the timestamp is
    missing or cannot be parsed.
    """
    last_update = _parse_dt(crate.get("updated_at"))
    if last_update is None:
        return "unknown", None

    age_days = int((now - last_update).total_seconds() / 86400)

    # Ordered (inclusive upper bound in days, label) pairs; first match wins.
    thresholds = ((90, "active"), (365, "slowing"), (730, "stale"))
    for limit, label in thresholds:
        if age_days <= limit:
            return label, age_days
    return "dormant", age_days
54
55
def transform(crate: dict, now: datetime) -> dict:
    """Convert one raw crates.io crate record into the actor's output item.

    Missing download counters are reported as 0. ``recentDownloadShare`` is
    recent / total downloads rounded to 4 places, or ``None`` when the total is
    zero. A blank description becomes ``None``. The crate name falls back to
    ``id`` when ``name`` is absent, and the same value is used for ``url``.
    *now* is forwarded to ``compute_activity`` as the reference time.
    """
    level, age_days = compute_activity(crate, now)

    total_downloads = crate.get("downloads") or 0
    recent_downloads = crate.get("recent_downloads") or 0
    if total_downloads:
        share = round(recent_downloads / total_downloads, 4)
    else:
        share = None

    crate_name = crate.get("name") or crate.get("id")
    summary = (crate.get("description") or "").strip()

    return {
        "name": crate_name,
        "description": summary or None,
        "downloads": total_downloads,
        "recentDownloads": recent_downloads,
        "recentDownloadShare": share,
        "maxStableVersion": crate.get("max_stable_version"),
        "newestVersion": crate.get("newest_version"),
        "activityLevel": level,
        "daysSinceUpdate": age_days,
        "repository": crate.get("repository"),
        "homepage": crate.get("homepage"),
        "documentation": crate.get("documentation"),
        "createdAt": crate.get("created_at"),
        "updatedAt": crate.get("updated_at"),
        "url": f"https://crates.io/crates/{crate_name}",
    }
78
79
async def main() -> None:
    """Run the actor: search crates.io and push one record per crate.

    Reads ``query``, ``category``, ``keyword``, ``sort`` and ``maxItems`` from
    the actor input. At least one of query/category/keyword is required;
    otherwise a warning is logged and the run ends with no results.

    Pages through the crates.io endpoint with a fixed page size (at most 100),
    making up to three attempts per page, and pushes transformed crates to the
    dataset until ``maxItems`` records were pushed, the API reports no further
    results, or a page cannot be fetched.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}

        query = (actor_input.get("query") or "").strip()
        category = (actor_input.get("category") or "").strip()
        keyword = (actor_input.get("keyword") or "").strip()
        sort = SORT_MAP.get(actor_input.get("sort", "relevance"), "relevance")
        # An explicit null must fall back to the default rather than crash in
        # int(None); an explicit 0 is still honoured as "return nothing".
        raw_max_items = actor_input.get("maxItems")
        max_items = 50 if raw_max_items is None else int(raw_max_items)

        if not (query or category or keyword):
            Actor.log.warning("Provide a search query, category, or keyword.")
            await Actor.push_data([])
            return

        Actor.log.info(f"crates.io search: q={query!r} category={category!r} "
                       f"keyword={keyword!r} sort={sort} max={max_items}")
        now = datetime.now(timezone.utc)

        # per_page must stay constant for the whole run. With page-based
        # pagination the offset depends on both page and per_page, so shrinking
        # per_page on the last request would re-fetch crates that were already
        # pushed instead of the next ones. The surplus is trimmed by the slice
        # below.
        page_size = min(100, max_items)
        pushed = 0
        page = 1
        total = None

        async with httpx.AsyncClient(
            timeout=30.0,
            headers={"User-Agent": "scrapeworks-crates-search/0.1 (https://apify.com/scrapeworks)"},
        ) as client:
            while pushed < max_items:
                params = {
                    "page": page,
                    "per_page": page_size,
                    "sort": sort,
                }
                if query:
                    params["q"] = query
                if category:
                    params["category"] = category
                if keyword:
                    params["keyword"] = keyword
                url = f"{API}?{urlencode(params)}"

                # Up to three attempts with a growing pause (3s, then 6s).
                data = None
                for attempt in range(1, 4):
                    try:
                        resp = await client.get(url)
                        resp.raise_for_status()
                        data = resp.json()
                        break
                    except (httpx.HTTPError, ValueError) as exc:
                        Actor.log.warning(f"Request attempt {attempt} failed: {exc}")
                        if attempt < 3:
                            await asyncio.sleep(attempt * 3)

                if data is None:
                    Actor.log.error(f"Failed to fetch page {page}; stopping.")
                    break

                if total is None:
                    total = (data.get("meta") or {}).get("total")
                    if total is not None:
                        Actor.log.info(f"crates.io reports {total} total matching crates.")

                crates = data.get("crates", [])
                if not crates:
                    Actor.log.info("No more results.")
                    break

                # Trim the final page so no more than max_items are pushed.
                batch = [transform(c, now) for c in crates[: max_items - pushed]]
                await Actor.push_data(batch)
                pushed += len(batch)
                Actor.log.info(f"Pushed {pushed}/{max_items} crates.")

                page += 1
                # Stop before the politeness delay when nothing is left to fetch.
                if pushed >= max_items:
                    break
                if total is not None and pushed >= total:
                    break
                await asyncio.sleep(1)

        Actor.log.info(f"Done. Returned {pushed} crates.")