import json

import httpx

from .all_config import (
    API_KEY, SITEMAP_ID
)

BASE_URL = "https://api.webscraper.io/api/v1/scraping-job"


def start_scraping(sitemap_id=SITEMAP_ID, url_=None):
    """Start a scraping job for the given sitemap ID."""
    url = f"{BASE_URL}?api_token={API_KEY}"  # The API token is passed as a query parameter

    headers = {"Content-Type": "application/json",
               "Accept": "application/json"}
    payload = {
        "sitemap_id": sitemap_id,
        "driver": "fulljs",
        "page_load_delay": 2000,
        "request_interval": 1000,
        "proxy": 1,
    }
    # Only override the sitemap's start URLs when one is supplied;
    # otherwise the API would receive start_urls = [None] and reject it.
    if url_ is not None:
        payload["start_urls"] = [url_]

    with httpx.Client() as client:
        # Use json=payload (not data=) so httpx serializes the body as JSON
        response = client.post(url, headers=headers, json=payload)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error starting scraping: {response.status_code} - {response.text}")
        return {}


# start_scraping returns, e.g.:
# {'success': True, 'data': {'id': 28731019, 'custom_id': None}}
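#
# A minimal usage sketch (the start URL is a hypothetical placeholder; the
# sitemap ID defaults to SITEMAP_ID from all_config):
#
#     job = start_scraping(url_="https://example.com/listing-page")
#     job_id = job.get("data", {}).get("id")  # e.g. 28731019, per the sample above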


def get_scraping_status_by_job(job_id):
    """Fetch the status of a scraping job."""
    url = f"{BASE_URL}/{job_id}?api_token={API_KEY}"  # Job status endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching status: {response.status_code} - {response.text}")
        return {}


# Scraping job status, e.g.:
# {'success': True, 'data': {'id': 28731019, 'custom_id': None, 'sitemap_id': 1274721,
#  'status': 'started', 'test_run': 0, 'sitemap_name': 'realestate_au',
#  'time_created': 1740759317, 'jobs_scheduled': 2, 'jobs_executed': 29,
#  'jobs_failed': 0, 'stored_record_count': 29, 'request_interval': 1000,
#  'page_load_delay': 2000, 'driver': 'fulljs', 'jobs_empty': 0,
#  'jobs_no_value': 0, 'scraping_duration': 207, 'scheduled': 0}}
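

# The status payload above lends itself to a simple polling loop. A minimal
# sketch: it treats any status other than 'started' as terminal, which is an
# assumption (only 'started' is confirmed by the sample response above).
def wait_for_job(job_id, poll_seconds=10, max_polls=60):
    """Poll the status endpoint until the job leaves the 'started' state."""
    import time  # local import to keep this sketch self-contained

    status = {}
    for _ in range(max_polls):
        status = get_scraping_status_by_job(job_id)
        if status.get("data", {}).get("status") != "started":
            break  # assumed terminal (e.g. 'finished') or an error response
        time.sleep(poll_seconds)
    return status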


def get_scraping_results(job_id):
    """Fetch the scraped data of a completed job."""
    url = f"{BASE_URL}/{job_id}/json?api_token={API_KEY}"  # Results endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        try:
            # The endpoint returns JSON Lines (one JSON object per line),
            # so parse each line individually instead of the body as a whole
            results = []
            for line in response.text.splitlines():
                try:
                    results.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
                    continue

            # Persist the parsed records for later inspection
            with open("scraped_data.json", "w") as json_file:
                json.dump(results, json_file, indent=4)

            return results
        except Exception as e:
            print(f"Error processing response: {e}")
            return []
    else:
        print(f"Error fetching results: {response.status_code}")
        return []
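

# End-to-end sketch combining the calls above (the start URL is a hypothetical
# placeholder; wait_for_job is the polling helper defined earlier):
#
#     job = start_scraping(url_="https://example.com/listing-page")
#     job_id = job.get("data", {}).get("id")
#     if job_id:
#         wait_for_job(job_id)
#         records = get_scraping_results(job_id)  # also written to scraped_data.json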


"""
Method: POST
URL: https://api.webscraper.io/api/v1/scraping-job?api_token=<YOUR API TOKEN>
JSON:
{
    "sitemap_id": 123,
    "driver": "fast",        // "fast" or "fulljs"
    "page_load_delay": 2000,
    "request_interval": 2000,
    "proxy": 0,              // 0: no proxy, 1: use proxy, 123: custom proxy ID,
                             // 'residential-*': residential proxy, where * is a
                             // country code, e.g. 'residential-us'
    "start_urls": [          // optional; if set, overrides the sitemap's start URLs
        "https://www.webscraper.io/test-sites/e-commerce/allinone/computers",
        "https://www.webscraper.io/test-sites/e-commerce/allinone/phones"
    ],
    "custom_id": "custom-scraping-job-12"  // optional; included in the webhook notification
}
"""