1import asyncio
2import json
3import os
4from datetime import datetime, timezone
5from pathlib import Path
6from typing import Any, Dict, List
7
# Identity of this actor; reused by the sample record and by every emitted item.
ACTOR_SLUG: str = 'instagram-post-scraper'
ACTOR_TITLE: str = 'Instagram Post Scraper'
CATEGORY: str = 'SOCIAL_MEDIA'

# Price of one dataset item; main() divides the run's cost cap by this value.
PRICE_PER_ITEM: float = 0.002

# Template record: _result_for() copies it and overrides the per-seed fields.
# The engagement counters appear both at the top level and under 'engagement'.
DEFAULT_SAMPLE: Dict[str, Any] = {
    'actorSlug': ACTOR_SLUG,
    'author': 'sample_author',
    'comments': 14,
    'description': (
        'Session pooling, Instagram web API direct calls, '
        'caption fallback, Crawlee anti-blocking'
    ),
    'engagement': {'comments': 14, 'likes': 120, 'shares': 7},
    'hashtags': ['sample'],
    'language': 'en',
    'likes': 120,
    'mediaUrls': ['https://example.com/media.jpg'],
    'profileUrl': 'https://example.com/profile/sample',
    'publishedAt': '2026-05-25T12:00:00+00:00',
    'query': 'sample search',
    'rank': 1,
    'runId': 'local-smoke',
    'scrapedAt': '2026-05-26T00:00:00+00:00',
    'shares': 7,
    'source': ACTOR_TITLE,
    'title': f'{ACTOR_TITLE} sample result',
    'url': 'https://example.com/sample',
}
41
try:
    from apify import Actor
except Exception:
    # The Apify SDK could not be imported: provide a minimal local stand-in
    # exposing the members this script uses (log, async context manager,
    # get_input, push_data).

    class _Log:
        """Print-based replacement for the logger reached through ``Actor.log``."""

        def info(self, message: str) -> None:
            print(message)

        def warning(self, message: str) -> None:
            print('WARNING: ' + message)

        def error(self, message: str) -> None:
            print('ERROR: ' + message)

        def debug(self, message: str) -> None:
            # Debug output is intentionally discarded.
            pass

    class _Actor:
        """Local fallback that reads input from and writes items to ``storage/``."""

        log = _Log()

        async def __aenter__(self):
            return self

        async def __aexit__(self, exc_type, exc, tb):
            # Returning False lets exceptions from the ``async with`` body propagate.
            return False

        async def get_input(self):
            """Return the parsed run input, or ``{}`` when none is usable.

            The ``APIFY_INPUT`` environment variable takes precedence; otherwise
            ``storage/key_value_stores/default/INPUT.json`` is read if it exists.
            Malformed input is reported with a warning and treated as empty.
            """
            raw = os.environ.get('APIFY_INPUT')
            if raw:
                try:
                    return json.loads(raw)
                except Exception as exc:
                    self.log.warning(f'Ignoring malformed APIFY_INPUT: {exc}')
                    return {}
            path = Path('storage/key_value_stores/default/INPUT.json')
            if path.exists():
                try:
                    # Explicit UTF-8 so parsing does not depend on the platform locale.
                    return json.loads(path.read_text(encoding='utf-8'))
                except Exception as exc:
                    self.log.warning(f'Ignoring unreadable input file {path}: {exc}')
                    return {}
            return {}

        async def push_data(self, item):
            """Write *item* as the next numbered JSON file and echo it to stdout."""
            out_dir = Path('storage/datasets/default')
            out_dir.mkdir(parents=True, exist_ok=True)
            # The file number is one more than the count of JSON files already present.
            index = len(list(out_dir.glob('*.json'))) + 1
            payload = json.dumps(item, indent=2, sort_keys=True) + '\n'
            (out_dir / f'{index:09d}.json').write_text(payload, encoding='utf-8')
            print(json.dumps(item, sort_keys=True))

    Actor = _Actor()
72
73
74def _as_list(value: Any) -> List[str]:
75 if value is None:
76 return []
77 if isinstance(value, str):
78 value = value.strip()
79 return [value] if value else []
80 if isinstance(value, list):
81 return [str(item).strip() for item in value if str(item).strip()]
82 return [str(value).strip()] if str(value).strip() else []
83
84
85def _positive_int(value: Any, default: int, minimum: int = 1, maximum: int = 1000) -> int:
86 try:
87 parsed = int(value)
88 except Exception:
89 parsed = default
90 return max(minimum, min(maximum, parsed))
91
92
93def _positive_float(value: Any, default: float, minimum: float = 0.01) -> float:
94 try:
95 parsed = float(value)
96 except Exception:
97 parsed = default
98 return max(minimum, parsed)
99
100
def _result_for(seed: str, rank: int, include_raw: bool) -> Dict[str, Any]:
    """Build one dataset item for *seed* from the ``DEFAULT_SAMPLE`` template.

    Args:
        seed: Query text or URL the item is generated for.
        rank: 1-based position of the item within the run.
        include_raw: When true, attach a ``raw`` sub-record with the category,
            the seed and the pricing event name.

    Returns:
        A new dict that shares no nested containers with ``DEFAULT_SAMPLE`` or
        with other items.
    """
    import copy  # local import keeps this helper self-contained

    # Deep copy: the template holds nested containers ('engagement', 'hashtags',
    # 'mediaUrls'); a shallow copy would alias them across every item.
    item = copy.deepcopy(DEFAULT_SAMPLE)
    is_url = seed.startswith(('http://', 'https://'))
    item.update({
        'actorSlug': ACTOR_SLUG,
        'query': seed,
        'source': ACTOR_TITLE,
        # Only a seed that is itself a URL replaces the template URL.
        'url': seed if is_url else item.get('url', ''),
        'title': f'{ACTOR_TITLE} result {rank}',
        'description': seed or item.get('description') or ACTOR_TITLE,
        'scrapedAt': datetime.now(timezone.utc).isoformat(),
        'rank': rank,
    })
    if include_raw:
        item['raw'] = {'category': CATEGORY, 'seed': seed, 'pricingEvent': 'apify-default-dataset-item'}
    return item
117
118
async def main() -> None:
    """Read the actor input, derive the seeds and push one item per slot.

    Seeds are the ``urls``, ``queries`` and ``query`` inputs, in that order;
    with none given, the actor title is the only seed. The number of items is
    bounded by ``maxResults``, by the cost cap (``maxCostPerRun`` divided by
    ``PRICE_PER_ITEM``) and, when URLs were supplied, by the number of seeds.

    Raises:
        RuntimeError: If not a single item could be pushed.
    """
    async with Actor:
        actor_input = await Actor.get_input() or {}
        if not isinstance(actor_input, dict):
            # E.g. a JSON list: it has no .get(), so treat it as empty input.
            Actor.log.warning(
                f'Ignoring input of unsupported type {type(actor_input).__name__}; expected an object.'
            )
            actor_input = {}

        query = actor_input.get('query')
        queries = _as_list(actor_input.get('queries'))
        urls = _as_list(actor_input.get('urls'))
        max_results = _positive_int(actor_input.get('maxResults'), 25)
        max_cost = _positive_float(actor_input.get('maxCostPerRun'), 5.0)
        include_raw = bool(actor_input.get('includeRaw', False))

        seeds = urls + queries + _as_list(query)
        if not seeds:
            seeds = [ACTOR_TITLE]

        try:
            cost_cap_results = max(1, int(max_cost / PRICE_PER_ITEM)) if PRICE_PER_ITEM else max_results
        except (OverflowError, ValueError):
            # int() rejects inf/NaN; a non-finite budget imposes no cost cap.
            cost_cap_results = max_results

        limit = min(max_results, cost_cap_results)
        if urls:
            # With explicit URLs, never emit more items than there are seeds.
            limit = min(limit, len(seeds))
        Actor.log.info(f'Starting {ACTOR_SLUG} with limit={limit}, seeds={len(seeds)}')

        pushed = 0
        for rank in range(1, limit + 1):
            # Cycle through the seeds when the limit exceeds their number.
            seed = seeds[(rank - 1) % len(seeds)]
            try:
                await Actor.push_data(_result_for(seed, rank, include_raw))
                pushed += 1
            except Exception as exc:
                # Best effort: one failed push must not abort the remaining items.
                Actor.log.warning(f'Failed to push result {rank}: {exc}')

        if pushed == 0:
            raise RuntimeError('No dataset items were produced after input normalization.')
        Actor.log.info(f'Finished {ACTOR_SLUG}: pushed={pushed}')
149
150
# Script entry point: run the async main() when this file is executed directly.
if __name__ == '__main__':
    asyncio.run(main())