1import requests
2import json
3import time
4import logging
5from datetime import datetime
6from typing import List, Dict, Optional, Any
7from faker import Faker
8import random
9import uuid
10
class SyntheticFlowGenerator:
    """
    LLM-powered contextual synthetic data generator
    Following No-API Protocol with mirror fallbacks

    In the code of this class, every record is produced locally with
    ``Faker`` and ``random``. ``_make_request`` is the only method that
    touches the network, and no other method of this class calls it.

    The public entry point is :meth:`generate_synthetic_data`; everything
    prefixed with an underscore is an implementation detail.
    """

    # Maps the public ``data_type`` string to the name of the method that
    # builds the payload. Names (not bound methods) are stored so subclass
    # overrides are honoured via ``getattr`` at call time. Any data type not
    # listed here falls back to ``_generate_generic_data``.
    _GENERATORS_BY_DATA_TYPE = {
        "customer_profiles": "_generate_customer_profile",
        "business_documents": "_generate_business_document",
        "product_catalog": "_generate_product_catalog",
        "financial_records": "_generate_financial_records",
        "user_behavior": "_generate_user_behavior",
        "market_data": "_generate_market_data",
    }

    def __init__(self, use_mirror_fallbacks: bool = True):
        """Set up the HTTP session, the Faker instance, logging and templates.

        Args:
            use_mirror_fallbacks: When true, ``_make_request`` is allowed to
                try the URLs in ``self.mirrors`` after the requested URL.
        """
        self.session = requests.Session()
        self.use_mirror_fallbacks = use_mirror_fallbacks
        self.fake = Faker()

        self.llm_endpoints = {
            "openai_demo": "https://chat.openai.com",
            "anthropic_demo": "https://claude.ai",
            "huggingface": "https://huggingface.co",
            "local_llama": "http://localhost:8000"
        }

        # NOTE(review): the last entry wraps the proxy prefix around itself;
        # kept as-is because it is runtime data - confirm whether intended.
        self.mirrors = [
            "https://r.jina.ai/http://chat.openai.com",
            "https://r.jina.ai/http://claude.ai",
            "https://r.jina.ai/http://huggingface.co",
            "https://r.jina.ai/http://r.jina.ai/http://chat.openai.com"
        ]

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # Every request made through the session carries these headers.
        self.session.headers.update(self.headers)

        # basicConfig only has an effect if the root logger has no handlers.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Per-industry metadata. Within this class only the "trends" list is
        # read (by _generate_customer_profile); "e-commerce" doubles as the
        # default template for unknown industries.
        self.industry_templates = {
            "e-commerce": {
                "customer_fields": ["name", "email", "phone", "address", "purchase_history", "preferences"],
                "business_context": "online retail, digital shopping, customer behavior",
                "trends": ["sustainability", "personalization", "mobile commerce", "social shopping"]
            },
            "healthcare": {
                "customer_fields": ["patient_id", "name", "age", "condition", "treatment_history", "insurance"],
                "business_context": "medical services, patient care, health records",
                "trends": ["telemedicine", "AI diagnostics", "personalized medicine", "preventive care"]
            },
            "finance": {
                "customer_fields": ["account_number", "name", "credit_score", "transaction_history", "investment_portfolio"],
                "business_context": "banking services, financial planning, investment management",
                "trends": ["digital banking", "cryptocurrency", "robo-advisors", "ESG investing"]
            },
            "technology": {
                "customer_fields": ["user_id", "name", "skills", "experience", "project_history", "tech_stack"],
                "business_context": "software development, IT services, tech consulting",
                "trends": ["AI/ML", "cloud computing", "devops", "cybersecurity"]
            },
            "real-estate": {
                "customer_fields": ["client_id", "name", "property_preferences", "budget_range", "location_history"],
                "business_context": "property sales, real estate investment, property management",
                "trends": ["smart homes", "sustainable buildings", "virtual tours", "prop-tech"]
            }
        }

    @staticmethod
    def _format_currency(amount: int) -> str:
        """Format an integer as dollars with thousands separators.

        The sign is placed before the dollar symbol (``-$1,234``); formatting
        a negative number directly after ``$`` would give ``$-1,234``.
        """
        sign = "-" if amount < 0 else ""
        return f"{sign}${abs(amount):,}"

    @staticmethod
    def _transaction_type(amount: int) -> str:
        """Return "debit" for a negative amount and "credit" otherwise."""
        return "debit" if amount < 0 else "credit"

    def _make_request(self, url: str, use_mirror: bool = False) -> "Optional[requests.Response]":
        """Make HTTP request with fallback to mirrors.

        Args:
            url: The first URL to try.
            use_mirror: When true, and the instance was created with
                ``use_mirror_fallbacks=True``, every URL in ``self.mirrors``
                is tried after ``url``, in order.

        Returns:
            The first response with status code 200, or ``None`` when every
            candidate failed or returned another status.
        """
        urls_to_try = [url]
        if use_mirror and self.use_mirror_fallbacks:
            urls_to_try.extend(self.mirrors)

        for attempt_url in urls_to_try:
            self.logger.info("Trying LLM endpoint: %s", attempt_url)
            try:
                response = self.session.get(attempt_url, timeout=10)
            except Exception as exc:
                # Deliberately broad: this is a best-effort probe, so any
                # failure on one URL must not stop the remaining candidates.
                self.logger.warning("Error with LLM endpoint %s: %s", attempt_url, exc)
                continue

            if response.status_code == 200:
                self.logger.info("Success with LLM endpoint: %s", attempt_url)
                return response

            self.logger.warning("Status %s for LLM endpoint: %s", response.status_code, attempt_url)

        return None

    def _generate_contextual_data(self, industry: str, data_type: str, context: str, count: int, trend_aware: bool) -> List[Dict[str, Any]]:
        """Build ``count`` record envelopes, each wrapping one generated payload.

        The payload generator is chosen from ``data_type``; unknown types use
        ``_generate_generic_data``. Unknown industries use the "e-commerce"
        template.

        Returns:
            A list of records flagged ``llm_enhanced=True``. If anything
            raises while building them, the partial result is discarded and
            the output of ``_generate_fallback_data`` is returned instead.
        """
        synthetic_data = []

        try:
            template = self.industry_templates.get(industry, self.industry_templates["e-commerce"])
            generator_name = self._GENERATORS_BY_DATA_TYPE.get(data_type, "_generate_generic_data")
            build_payload = getattr(self, generator_name)

            for _ in range(count):
                data_record = {
                    "data_id": str(uuid.uuid4()),
                    "industry": industry,
                    "data_type": data_type,
                    "context": context,
                    "generated_at": datetime.now().isoformat()
                }
                data_record["synthetic_data"] = build_payload(template, context, trend_aware)
                data_record["llm_enhanced"] = True
                data_record["privacy_compliant"] = True
                data_record["trend_aware"] = trend_aware

                synthetic_data.append(data_record)

            self.logger.info("Generated %s contextual synthetic records for %s", len(synthetic_data), industry)

        except Exception as exc:
            # Boundary handler: callers always get a list back, never an
            # exception from one of the payload generators.
            self.logger.error("Error generating contextual data: %s", exc)
            synthetic_data = self._generate_fallback_data(industry, data_type, context, count)

        return synthetic_data

    def _generate_customer_profile(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate one customer profile.

        Args:
            template: Industry template; only its "trends" list is read.
            context: Free text; if it contains "luxury" (case-insensitive)
                the income and average order value ranges are raised.
            trend_aware: When true, adds ``trend_affinity`` and
                ``digital_engagement``.
        """
        profile = {
            "name": self.fake.name(),
            "email": self.fake.email(),
            "phone": self.fake.phone_number(),
            "address": self.fake.address(),
            "age": random.randint(18, 80),
            "income": self._format_currency(random.randint(30000, 200000)),
            "join_date": self.fake.date_between(start_date='-5y', end_date='today').isoformat(),
            "loyalty_status": random.choice(["bronze", "silver", "gold", "platinum"]),
            "preferences": random.sample(["quality", "price", "convenience", "sustainability", "innovation"], 2),
            "purchase_frequency": random.choice(["daily", "weekly", "monthly", "quarterly"]),
            "avg_order_value": self._format_currency(random.randint(50, 500)),
            "last_purchase": self.fake.date_between(start_date='-30d', end_date='today').isoformat()
        }

        if trend_aware:
            # `or` also covers a template whose "trends" list is present but
            # empty, which would otherwise make the random pick raise.
            trends = template.get("trends") or ["digital", "mobile"]
            profile["trend_affinity"] = random.choice(trends)
            profile["digital_engagement"] = random.randint(1, 10)

        if "luxury" in context.lower():
            profile["income"] = self._format_currency(random.randint(100000, 500000))
            profile["avg_order_value"] = self._format_currency(random.randint(200, 2000))

        return profile

    def _generate_business_document(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate one business document (invoice, contract, proposal, ...).

        ``template`` and ``context`` are accepted for signature parity with
        the other generators and are not read. ``trend_aware`` adds
        ``digital_signature`` and ``blockchain_verified``.
        """
        doc_type = random.choice(["invoice", "contract", "proposal", "report", "statement"])

        document = {
            "document_id": f"{doc_type.upper()}-{random.randint(100000, 999999)}",
            "document_type": doc_type,
            "company": self.fake.company(),
            "issue_date": self.fake.date_between(start_date='-1y', end_date='today').isoformat(),
            "due_date": self.fake.date_between(start_date='today', end_date='+90d').isoformat(),
            "amount": self._format_currency(random.randint(1000, 50000)),
            "currency": random.choice(["USD", "EUR", "GBP"]),
            "status": random.choice(["draft", "sent", "paid", "overdue"]),
            "recipient": self.fake.company(),
            "description": self.fake.text(max_nb_chars=200)
        }

        if trend_aware:
            document["digital_signature"] = True
            document["blockchain_verified"] = random.choice([True, False])

        return document

    def _generate_product_catalog(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate one product catalog entry.

        ``template`` and ``context`` are not read. ``trend_aware`` adds the
        ``eco_friendly`` and ``smart_features`` flags.
        """
        dimensions = "x".join(str(random.randint(10, 100)) for _ in range(3))

        product = {
            "product_id": f"PROD-{random.randint(10000, 99999)}",
            "name": self.fake.catch_phrase(),
            "category": random.choice(["electronics", "clothing", "home", "sports", "beauty"]),
            "price": self._format_currency(random.randint(10, 1000)),
            "cost": self._format_currency(random.randint(5, 500)),
            "sku": f"SKU-{random.randint(100000, 999999)}",
            "stock_quantity": random.randint(0, 1000),
            "description": self.fake.text(max_nb_chars=300),
            "specifications": {
                "weight": f"{random.randint(1, 100)}kg",
                "dimensions": f"{dimensions}cm",
                "material": random.choice(["plastic", "metal", "wood", "fabric", "glass"])
            },
            "rating": round(random.uniform(1.0, 5.0), 1),
            "review_count": random.randint(0, 5000)
        }

        if trend_aware:
            product["eco_friendly"] = random.choice([True, False])
            product["smart_features"] = random.choice([True, False])

        return product

    def _generate_transaction(self) -> Dict[str, Any]:
        """Generate one account transaction dated within the last 30 days."""
        amount = random.randint(-1000, 5000)
        return {
            "date": self.fake.date_between(start_date='-30d', end_date='today').isoformat(),
            "description": self.fake.text(max_nb_chars=50),
            "amount": self._format_currency(amount),
            # Derived from the sign so a record never pairs a negative
            # amount with "credit" or a positive one with "debit".
            "type": self._transaction_type(amount)
        }

    def _generate_financial_records(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate one account record with 5-20 transactions.

        None of the three parameters is read; they exist for signature
        parity with the other generators.
        """
        return {
            "account_id": f"ACC-{random.randint(100000, 999999)}",
            "account_type": random.choice(["checking", "savings", "investment", "credit"]),
            # The balance range includes negative values (overdrawn account).
            "balance": self._format_currency(random.randint(-5000, 100000)),
            "credit_score": random.randint(300, 850),
            "transactions": [self._generate_transaction() for _ in range(random.randint(5, 20))],
            "monthly_income": self._format_currency(random.randint(2000, 15000)),
            "monthly_expenses": self._format_currency(random.randint(1000, 10000))
        }

    def _generate_user_behavior(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate one user-session record with 5-25 logged actions.

        None of the three parameters is read.
        """
        def make_action() -> Dict[str, Any]:
            return {
                "action": random.choice(["view", "click", "purchase", "signup"]),
                "timestamp": self.fake.date_time_between(start_date='-30d', end_date='now').isoformat(),
                "page": f"/page-{random.randint(1, 100)}"
            }

        return {
            "user_id": f"USER-{random.randint(100000, 999999)}",
            "session_duration": random.randint(30, 3600),
            "page_views": random.randint(1, 50),
            "bounce_rate": round(random.uniform(0.0, 1.0), 2),
            "conversion_rate": round(random.uniform(0.0, 0.5), 3),
            "device_type": random.choice(["desktop", "mobile", "tablet"]),
            "browser": random.choice(["chrome", "firefox", "safari", "edge"]),
            "location": self.fake.country(),
            "actions": [make_action() for _ in range(random.randint(5, 25))]
        }

    def _generate_market_data(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate one market-data record.

        Uses only ``random`` (no Faker). None of the three parameters is read.
        """
        symbol = random.choice(['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'TSLA'])

        return {
            "market_id": f"MKT-{random.randint(1000, 9999)}",
            "sector": random.choice(["technology", "healthcare", "finance", "retail", "energy"]),
            "ticker": f"{symbol}-{random.randint(1, 100)}",
            "price": round(random.uniform(10.0, 1000.0), 2),
            "volume": random.randint(100000, 10000000),
            "market_cap": self._format_currency(random.randint(1000000, 1000000000)),
            "pe_ratio": round(random.uniform(5.0, 50.0), 2),
            "dividend_yield": round(random.uniform(0.0, 0.1), 4),
            "beta": round(random.uniform(0.5, 2.0), 2),
            "trend": random.choice(["bullish", "bearish", "neutral", "volatile"])
        }

    def _generate_generic_data(self, template: Dict, context: str, trend_aware: bool) -> Dict[str, Any]:
        """Generate a generic payload: random text plus creation metadata.

        Only ``context`` is read; it is copied into the metadata.
        """
        return {
            "id": str(uuid.uuid4()),
            "type": "generic",
            "content": self.fake.text(max_nb_chars=500),
            "metadata": {
                "created": datetime.now().isoformat(),
                "source": "syntheticflow",
                "context": context
            }
        }

    def _generate_fallback_data(self, industry: str, data_type: str, context: str, count: int) -> List[Dict[str, Any]]:
        """Generate ``count`` generic records for when contextual generation fails.

        Records have the same envelope keys as the contextual ones but are
        flagged ``llm_enhanced=False`` and ``trend_aware=False``.
        """
        fallback_data = [
            {
                "data_id": str(uuid.uuid4()),
                "industry": industry,
                "data_type": data_type,
                "context": context,
                "synthetic_data": self._generate_generic_data({}, context, False),
                "llm_enhanced": False,
                "privacy_compliant": True,
                "trend_aware": False,
                "generated_at": datetime.now().isoformat()
            }
            for _ in range(count)
        ]

        self.logger.info("Generated %s fallback synthetic records", len(fallback_data))
        return fallback_data

    def generate_synthetic_data(self, industry: str, data_type: str, context: str, count: int,
                                include_multimodal: bool = True, privacy_compliance: str = "GDPR",
                                trend_aware: bool = True, output_format: str = "agent_ready") -> List[Dict[str, Any]]:
        """Main method to generate synthetic data.

        Args:
            industry: Key into ``self.industry_templates``; unknown values
                use the "e-commerce" template.
            data_type: One of "customer_profiles", "business_documents",
                "product_catalog", "financial_records", "user_behavior",
                "market_data"; anything else yields generic payloads.
            context: Free text stored on every record.
            count: Number of records to generate.
            include_multimodal: When true, adds placeholder file names under
                ``synthetic_data["multimodal_references"]``.
            privacy_compliance: Only logged by this method.
            trend_aware: Forwarded to the payload generators.
            output_format: "agent_ready" (records unchanged),
                "human_readable" (adds ``readable_summary``) or
                "database_ready" (payload keys are merged into the record).
                Any other value is logged as a warning and the records are
                returned unchanged.

        Returns:
            The list of generated records.
        """
        self.logger.info("Generating %s synthetic records for %s - %s", count, industry, data_type)
        self.logger.info("Context: %s", context)
        self.logger.info("Trend-aware: %s, Privacy: %s", trend_aware, privacy_compliance)

        synthetic_data = self._generate_contextual_data(industry, data_type, context, count, trend_aware)

        if include_multimodal:
            for record in synthetic_data:
                # File names are derived from the first 8 chars of the id;
                # no files are created here.
                short_id = record['data_id'][:8]
                record["synthetic_data"]["multimodal_references"] = {
                    "profile_image": f"synthetic_image_{short_id}.jpg",
                    "document_pdf": f"synthetic_doc_{short_id}.pdf",
                    "media_files": [f"media_{short_id}_{i}.png" for i in range(3)]
                }

        if output_format == "human_readable":
            for record in synthetic_data:
                record["readable_summary"] = f"Generated {data_type} for {industry} context: {context}"
        elif output_format == "database_ready":
            for record in synthetic_data:
                # Flatten: payload keys move to the top level and win on any
                # name clash with an envelope key.
                record.update(record.pop("synthetic_data", {}))
        elif output_format != "agent_ready":
            # Previously ignored silently; still non-fatal for callers.
            self.logger.warning("Unknown output_format %r; returning records unchanged", output_format)

        self.logger.info("Successfully generated %s synthetic data records", len(synthetic_data))
        return synthetic_data