1"""Anthropic-backed LLM extractor.
2
3Forces structured output via Claude tool use (`tool_choice` set to the
4`extract_deals` tool). Validates each returned item against the strict
5Pydantic `Deal` schema and discards any that fail validation.
6
7Provenance fields (`id`, `source_urls`, `source_type`, `first_seen_at`,
8`last_verified_at`, `verification_status`) are injected by us — the model
9never sees them and never produces them.
10"""
11from __future__ import annotations
12
13import logging
14import re
15from datetime import datetime, timezone
16from typing import Any, Dict, List
17
18from anthropic import Anthropic
19from pydantic import ValidationError
20
21from src.models import Deal, VerificationStatus
22
# Module-scoped logger; every warning and error in this file is routed through it.
logger: logging.Logger = logging.getLogger(__name__)


# Model id used when the caller does not pass one to `LLMExtractor`.
DEFAULT_MODEL: str = "claude-haiku-4-5-20251001"
# Model id retried once when the API rejects the configured model
# (see `LLMExtractor._call`).
FALLBACK_MODEL: str = "claude-haiku-4-5"
29
30
31
32
# Enumerated vocabularies referenced by the tool schema below. They are plain
# lists so the schema stays JSON-serializable.
_DEAL_TYPES: List[str] = [
    "credits",
    "free_period",
    "percent_off",
    "discounted_plan",
    "lifetime_deal",
    "free_tier_upgrade",
    "waived_fees",
    "bundle",
]

_ELIGIBILITY_TAGS: List[str] = [
    "open",
    "startup_pre_seed",
    "startup_seed",
    "startup_series_a",
    "funding_cap",
    "accelerator_required",
    "vc_partner_required",
    "student",
    "oss_maintainer",
    "new_customer_only",
    "nonprofit",
    "edu",
    "paid_newsletter_sub",
]

_VERIFICATION_METHODS: List[str] = [
    "page_present",
    "code_redemption_check",
    "human",
    "none",
]

_CATEGORIES: List[str] = [
    "ai_tools",
    "hosting",
    "database",
    "devops",
    "design",
    "productivity",
    "communication",
    "analytics",
    "cloud",
    "payments",
    "email",
    "security",
    "auth",
    "video",
]

# Fields every returned deal object must carry.
_REQUIRED_DEAL_FIELDS: List[str] = [
    "provider",
    "provider_url",
    "program_name",
    "deal_type",
    "headline",
    "verification_method",
    "confidence_score",
]

# JSON-schema description of a single deal object. Provenance fields are
# deliberately absent: they are injected after the model responds.
_DEAL_PROPERTIES: Dict[str, Any] = {
    "provider": {"type": "string"},
    "provider_url": {
        "type": "string",
        "description": "Canonical homepage URL of the provider, e.g. https://vercel.com",
    },
    "program_name": {"type": "string"},
    "deal_type": {"type": "string", "enum": _DEAL_TYPES},
    "headline": {"type": "string", "maxLength": 200},
    "description": {"type": "string", "maxLength": 2000},
    "credit_value_usd": {"type": "number", "minimum": 0},
    "discount_percent": {"type": "number", "minimum": 0, "maximum": 100},
    "free_period_months": {"type": "integer", "minimum": 0},
    "duration_months": {"type": "integer", "minimum": 0},
    "estimated_total_value_usd": {"type": "number", "minimum": 0},
    "eligibility_tags": {
        "type": "array",
        "items": {"type": "string", "enum": _ELIGIBILITY_TAGS},
    },
    "funding_cap_usd": {"type": "number", "minimum": 0},
    "requires_partner": {"type": "string", "maxLength": 200},
    "geographic_restrictions": {"type": "array", "items": {"type": "string"}},
    "application_url": {"type": "string"},
    "promo_code": {
        "type": "string",
        "maxLength": 80,
        "description": "PUBLIC promo code only — never personalized codes",
    },
    "requires_application": {"type": "boolean"},
    "expected_response_time_days": {"type": "integer", "minimum": 0},
    "starts_at": {"type": "string", "description": "ISO date YYYY-MM-DD if known"},
    "expires_at": {"type": "string", "description": "ISO date YYYY-MM-DD if known"},
    "is_recurring": {"type": "boolean"},
    "verification_method": {"type": "string", "enum": _VERIFICATION_METHODS},
    "confidence_score": {"type": "number", "minimum": 0, "maximum": 1},
    "extraction_notes": {"type": "string", "maxLength": 500},
    "category": {
        "type": "array",
        "items": {"type": "string", "enum": _CATEGORIES},
    },
    "tags": {"type": "array", "items": {"type": "string"}},
}

# Tool definition handed to the Messages API. The extractor forces this tool,
# so the model's answer arrives as a `{"deals": [...]}` object.
EXTRACT_DEALS_TOOL: Dict[str, Any] = {
    "name": "extract_deals",
    "description": (
        "Return validated startup deals extracted from the provided source markdown. "
        "Return an empty array if the page does not describe a public, currently-live deal."
    ),
    "input_schema": {
        "type": "object",
        "properties": {
            "deals": {
                "type": "array",
                "description": "Array of distinct, public deals on the page.",
                "items": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": _DEAL_PROPERTIES,
                    "required": _REQUIRED_DEAL_FIELDS,
                },
            }
        },
        "required": ["deals"],
    },
}
127
128
# System prompt for the extraction request. Paragraphs are separated by a
# blank line; the calibration list uses single newlines between its rows.
SYSTEM_PROMPT = "\n\n".join(
    [
        # Scope: which deals may be reported at all.
        "You extract structured startup deal records from public 'for startups' "
        "marketing pages and aggregator listings. Output ONLY publicly-announced "
        "deals with public terms — never personalized promo codes, never deals "
        "that require a personal invite or referral.",
        # Granularity: one record per concrete provider+program.
        "If a page describes one big program with one offer, return one deal. "
        "If it lists many sub-deals (e.g. an aggregator), return one deal per "
        "concrete provider+program.",
        # How the model should fill in `confidence_score`.
        "\n".join(
            [
                "Confidence calibration:",
                " - 0.9+ : page explicitly states the dollar value, eligibility, and how to apply",
                " - 0.6-0.8 : the program is described but lacks concrete numbers",
                " - <0.5 : vague mention only — the host will discard these",
            ]
        ),
        # Anti-hallucination guardrails.
        "Always set verification_method='page_present' unless you actually verified "
        "a redemption code. NEVER fabricate dollar values or expiry dates. If unsure, "
        "omit the field.",
    ]
)
142
143
# One or more consecutive characters that are not lowercase ASCII letters or digits.
_SLUG_NONALNUM = re.compile(r"[^a-z0-9]+")


def slugify(s: str, max_len: int = 30) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated slug.

    The text is lowercased, every run of characters outside ``a-z0-9`` becomes
    a single ``-``, and leading/trailing hyphens are removed. The result is
    then cut to ``max_len`` characters and trimmed again so it never ends in
    a hyphen.

    Args:
        s: Text to slugify; a falsy value is treated as the empty string.
        max_len: Maximum number of characters kept.

    Returns:
        The slug, or ``"x"`` when nothing usable remains.
    """
    lowered = s.lower() if s else ""
    hyphenated = _SLUG_NONALNUM.sub("-", lowered).strip("-")
    if not hyphenated:
        hyphenated = "x"
    # Truncation can leave a dangling hyphen (or nothing at all).
    clipped = hyphenated[:max_len].strip("-")
    if clipped:
        return clipped
    return "x"
150
151
def make_deal_id(source_slug: str, provider: str, program_name: str) -> str:
    """Build a deal identifier from its source, provider and program name.

    Each component is slugified to at most 24 characters and the three slugs
    are joined with ``--``. The joined string is capped at 80 characters and
    stripped of leading/trailing hyphens.

    Args:
        source_slug: Identifier of the source the deal was found on.
        provider: Provider name as extracted.
        program_name: Program name as extracted.

    Returns:
        The identifier, or ``"deal"`` if the result would be empty.
    """
    components = (source_slug, provider, program_name)
    slugs = [slugify(component, 24) for component in components]
    deal_id = "--".join(slugs)[:80].strip("-")
    if deal_id:
        return deal_id
    return "deal"
156
157
class LLMExtractor:
    """Extract ``Deal`` records from page markdown via a forced Claude tool call.

    Every request forces the ``extract_deals`` tool, so the structured payload
    is read from the ``tool_use`` block of the response. Provenance fields
    (``id``, ``source_urls``, ``source_type``, ``first_seen_at``,
    ``last_verified_at``, ``verification_status``) are written by this class
    and override anything the model returned under the same keys.
    """

    # Output-token budget for a single request.
    _MAX_TOKENS = 4096
    # Only this many leading characters of the markdown are sent to the model.
    _MAX_MARKDOWN_CHARS = 60_000

    def __init__(self, api_key: str, model: str = DEFAULT_MODEL):
        """Create the API client.

        Args:
            api_key: Anthropic API key passed to the client.
            model: Model id to request; may be swapped for ``FALLBACK_MODEL``
                by ``_call`` if the API rejects it.
        """
        self.client = Anthropic(api_key=api_key)
        self.model = model
        # Ensures the model swap happens at most once per extractor instance.
        self._used_fallback = False

    def _create_message(self, system: str, user_msg: str) -> Any:
        """Issue one Messages API request with the currently selected model."""
        return self.client.messages.create(
            model=self.model,
            max_tokens=self._MAX_TOKENS,
            system=system,
            tools=[EXTRACT_DEALS_TOOL],
            tool_choice={"type": "tool", "name": "extract_deals"},
            messages=[{"role": "user", "content": user_msg}],
        )

    def _call(self, system: str, user_msg: str) -> Any:
        """Call Anthropic, retrying once with ``FALLBACK_MODEL`` if the model is rejected.

        A rejection is recognized from the error text ("not found", or both
        "model" and "invalid"). Any other error, or a rejection when no
        different fallback is left to try, is re-raised unchanged.

        Args:
            system: System prompt.
            user_msg: Content of the single user message.

        Returns:
            The response object returned by the client.

        Raises:
            Exception: Whatever the client raised, when no fallback applies or
                the fallback request fails as well.
        """
        try:
            return self._create_message(system, user_msg)
        except Exception as e:
            msg = str(e).lower()
            # Explicit grouping: `and` binds tighter than `or`.
            model_rejected = "not found" in msg or ("model" in msg and "invalid" in msg)
            can_fall_back = not self._used_fallback and self.model != FALLBACK_MODEL
            if not (model_rejected and can_fall_back):
                raise
            logger.warning(
                "Model %s rejected; falling back to %s",
                self.model, FALLBACK_MODEL,
            )
            self.model = FALLBACK_MODEL
            self._used_fallback = True
            return self._create_message(system, user_msg)

    @staticmethod
    def _tool_deals(resp: Any, source_url: str) -> List[Any]:
        """Return the ``deals`` list from the first ``extract_deals`` tool call.

        Malformed payloads (non-dict tool input, or a ``deals`` value that is
        not a list) are logged and treated as "no deals" instead of raising.
        """
        for block in getattr(resp, "content", None) or []:
            if getattr(block, "type", None) != "tool_use":
                continue
            if getattr(block, "name", "") != "extract_deals":
                continue
            payload = getattr(block, "input", None) or {}
            deals = payload.get("deals") if isinstance(payload, dict) else payload
            if deals is None:
                return []
            if not isinstance(deals, list):
                logger.warning(
                    "Ignoring malformed extract_deals payload from %s (got %s)",
                    source_url, type(deals).__name__,
                )
                return []
            return deals
        return []

    def extract(
        self,
        *,
        source_slug: str,
        source_url: str,
        source_type: str,
        markdown: str,
    ) -> List[Deal]:
        """Extract validated deals from one page's markdown.

        Never raises for API failures or bad model output: a failed call yields
        an empty list, and each item that is malformed or fails ``Deal``
        validation is logged and skipped.

        Args:
            source_slug: Source identifier used as the first part of each deal id.
            source_url: URL of the page; recorded in ``source_urls`` and in logs.
            source_type: Source type recorded on every deal.
            markdown: Page content; only the first 60,000 characters are sent.

        Returns:
            The deals that passed validation, in the order the model returned them.
        """
        if not markdown or not markdown.strip():
            return []

        capped = markdown[: self._MAX_MARKDOWN_CHARS]
        if len(capped) < len(markdown):
            logger.info(
                "Truncated markdown for %s from %d to %d characters",
                source_url, len(markdown), len(capped),
            )
        user_msg = (
            f"Source URL: {source_url}\n"
            f"Source type: {source_type}\n\n"
            f"=== PAGE MARKDOWN START ===\n{capped}\n=== PAGE MARKDOWN END ===\n\n"
            "Call the extract_deals tool with all distinct, public, currently-live "
            "deals you can identify. Return an empty array if none."
        )

        try:
            resp = self._call(SYSTEM_PROMPT, user_msg)
        except Exception:
            logger.exception("Anthropic call failed for %s", source_url)
            return []

        # A response cut off by the token budget may carry an incomplete tool input.
        if getattr(resp, "stop_reason", None) == "max_tokens":
            logger.warning(
                "Response for %s hit max_tokens; extracted deals may be incomplete",
                source_url,
            )

        now = datetime.now(timezone.utc)
        out: List[Deal] = []
        for raw in self._tool_deals(resp, source_url):
            if not isinstance(raw, dict):
                logger.warning(
                    "Discarding non-object deal from %s (got %s)",
                    source_url, type(raw).__name__,
                )
                continue
            try:
                provider = (raw.get("provider") or "").strip()
                program_name = (raw.get("program_name") or "").strip()
                if not provider or not program_name:
                    logger.warning(
                        "Discarding deal with missing provider/program from %s",
                        source_url,
                    )
                    continue

                # Build a fresh dict so the SDK response object is not mutated;
                # provenance keys come last so they win over model output.
                record: Dict[str, Any] = {
                    **raw,
                    "id": make_deal_id(source_slug, provider, program_name),
                    "source_urls": [source_url],
                    "source_type": source_type,
                    "first_seen_at": now,
                    "last_verified_at": now,
                    "verification_status": VerificationStatus.likely_live.value,
                }
                out.append(Deal.model_validate(record))
            except ValidationError as ve:
                logger.warning(
                    "Discarding invalid deal from %s: %s",
                    source_url,
                    [
                        {"loc": e.get("loc"), "msg": e.get("msg")}
                        for e in ve.errors()[:3]
                    ],
                )
            except Exception as e:
                # Best-effort boundary: one bad item must not sink the whole page.
                logger.warning("Discarding deal from %s due to error: %s", source_url, e)

        return out