nepali-dataset-crawler
Under maintenance
Crawls Nepali-language datasets from online news sites.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 6
Monthly users: 4
Runs succeeded: >99%
Last modified: 10 months ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Getting started with Python and Scrapy", "description": "Scrapes titles of websites using Scrapy.", "version": "0.0", "meta": { "templateId": "python-scrapy" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Python Scrapy Scraper", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "proxyConfiguration": { "sectionCaption": "Proxy and HTTP configuration", "title": "Proxy configuration", "type": "object", "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.", "editor": "proxy", "prefill": { "useApifyProxy": true }, "default": { "useApifyProxy": true } } }, "required": ["startUrls"]}
src/spiders/__init__.py
1"""2Scrapy spiders package3
4This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape5and process data from websites.6
7For detailed information on creating and utilizing spiders, refer to the official documentation:8https://docs.scrapy.org/en/latest/topics/spiders.html9"""
src/spiders/functions.py
import gzip  # needed by compress_file()
import langid
import re
import os, sys, json, csv
# import pandas as pd

# import pybloom_live
from pathlib import Path
from urllib.parse import urlparse


def is_valid_text_naive(text):
    stripped_text = text.strip()
    return stripped_text and len(stripped_text) > 3


def remove_fragments_from_url(url):
    # drop query strings and fragments, keep scheme://netloc/path
    parsed_url = urlparse(url)
    return parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path


def load_env_var_in_google_colab():
    # Assuming environment variables are set in google colab Secrets
    if 'google.colab' in sys.modules:
        # Code is running in google colab
        try:
            from google.colab import userdata
            environ_variables = ["REDIS_HOST", "REDIS_PASSWORD", "REDIS_PORT", "mongo_password", "mongo_username"]
            for env_var in environ_variables:
                try:
                    os.environ[env_var] = userdata.get(env_var)
                except Exception as Ex:
                    print(Ex)
                    # break
            os.environ.get("mongo_password")
        except Exception as Ex:
            print(Ex)
def merge_crawled_json_files():
    '''
    * Crawled data is saved in .json files.
    * There are multiple .json files.
    * This function combines them.
    '''
    data_files = set([
        file.split('_')[0].split('.json')[0] for file in os.listdir()
        if (file.endswith('.json')) and (
            file not in
            ['test.json', 'news_start_urls copy.json', 'news_start_urls.json'])
    ])
    merged_data = []
    for file in data_files:
        if remove_file_if_empty(file):
            # Removed empty file
            continue
        try:
            with open(file, 'r') as f:
                data = json.load(f)
        except Exception:
            '''
            The file is corrupt when scrapy is terminated while it is still crawling.
            When corrupt, the file ends with: `{some_data},`
            Appending `""]` at the end of the file makes it valid JSON again.
            '''
            with open(file, 'a') as f:
                f.write("\"\"]")

            with open(file, 'r') as f:
                data = json.load(f)
        merged_data.extend(data)
    return merged_data
def merge_same_named_json_files(delete_merged=False):
    '''
    * Crawled data is saved in .json files.
    * There are multiple .json files.
    * This function combines them.
    '''
    data_files = [
        file for file in os.listdir() if (file.endswith('.json')) and (
            file not in
            ['test.json', 'news_start_urls copy.json', 'news_start_urls.json']) and not file.endswith('_merged_.json')
    ]
    merged_data = []
    merged_file_name = '_merged_.json'
    for file in data_files:
        # merge only if it is less than 10 MB
        if os.path.getsize(file) > 10000000:
            continue
        # merged_file_name = file.split('_')[0].split(
        #     '.json')[0] + '_merged_.json'
        # remove_file_if_empty(merged_file_name)
        print(f'\n\n merged_file_name: {merged_file_name}\n')
        if remove_file_if_empty(file):
            # Removed empty file
            continue
        try:
            with open(file, 'r') as f:
                data = json.load(f)
        except Exception:
            '''
            The file is corrupt when scrapy is terminated while it is still crawling.
            When corrupt, the file ends with: `{some_data},`
            Appending `,{}]` at the end of the file makes it valid JSON again.
            '''
            with open(file, 'a') as f:
                f.write(",{}]")
            print(f'\n\n file: {file}\n')
            with open(file, 'r') as f:
                data = json.load(f)
        # load if merged file exists
        if os.path.exists(merged_file_name):
            try:
                with open(merged_file_name, 'r') as f:
                    old_data = json.load(f)
                if not old_data:
                    old_data = []
                old_data.extend(data)
            except Exception:
                print(f'\n\n-------------- Not merged_file_name: {merged_file_name}\n')
                continue
        else:
            old_data = data
        if not old_data:
            old_data = []
        if old_data:
            with open(merged_file_name, 'w') as f:
                json.dump(old_data, f)
        if delete_merged:
            os.remove(file)


def compress_file(input_file_path="nepali_news_dataset.csv",
                  output_file_path="nepali_news_dataset.csv.gz"):
    '''
    Compress input_file_path into a gzip archive at output_file_path.
    '''
    with open(input_file_path, 'rb') as csv_file:
        with gzip.open(output_file_path, 'wb') as compressed_file:
            compressed_file.writelines(csv_file)
# def save_nepali_paragraphs_to_csv(csv_file_name="crawled_nepali_news_dataset.csv"):
#     '''
#     '''
#     data_files = [
#         file for file in os.listdir() if (file.endswith('.json')) and (
#             file not in
#             ['test.json', 'news_start_urls copy.json', 'news_start_urls.json'])
#     ]
#     for file in data_files:
#         if remove_file_if_empty(file):
#             continue
#         # Removed empty file
#         try:
#             # load data from each file
#             with open(file, 'r') as f:
#                 data = json.load(f)
#         except:
#             '''
#             file is corrupt when scrapy is terminated while it is still crawling.
#             while corrupt, file is terminated with: `{some_data},`
#             adding : `""]` at the end of file to make it valid json file
#             '''
#             with open(file, 'a') as f:
#                 f.write(",{}]")
#             with open(file, 'r') as f:
#                 data = json.load(f)
#         nepali_paragraphs = []
#         for d in data:
#             if type(d) == dict:
#                 if 'paragraph' in d.keys():
#                     nepali_paragraphs.append(d)
#         # print(list(d.values()))
#         # # save dataset in a csv file
#         # with open(csv_file_name, 'a', newline='') as csv_file:
#         #     # Create a CSV writer object
#         #     csv_writer = csv.writer(csv_file)
#         #     # Write the data to the CSV file
#         #     csv_writer.writerows(list(d.values()))
#         print(f'len_nepali_paragraphs:{len(nepali_paragraphs)} keys:{nepali_paragraphs[0].keys()}')
#         # Open the CSV file in append mode
#         with open(csv_file_name, 'a', newline='') as csv_file:
#             # Create a CSV writer object
#             csv_writer = csv.writer(csv_file)
#
#             # Check if the CSV file is empty
#             is_empty = os.path.getsize(csv_file_name) == 0
#
#             # Extract unique column names from dictionary keys
#             column_names = set(key for d in nepali_paragraphs for key in d.keys())
#
#             # Write the header only if the CSV file is empty
#             if is_empty:
#                 csv_writer.writerow(column_names)
#
#             # Iterate through each dictionary and write rows to CSV
#             for d in nepali_paragraphs:
#                 # Create a list of values for the row, using empty string for missing keys
#                 row_values = [str(d.get(column, '')) for column in column_names]
#                 csv_writer.writerow(row_values)
#     # ----------------------------
#     # Drop duplicate rows in csv
#     # ----------------------------
#
#     # Read the CSV file into a DataFrame
#     df = pd.read_csv(csv_file_name)
#     # Drop duplicate rows
#     df.drop_duplicates(inplace=True)
#     # Write the cleaned DataFrame back to the CSV file
#     df.to_csv(csv_file_name, index=False)
def remove_file_if_empty(file_path):
    """Checks if a file is empty and removes it if it is.

    Args:
        file_path (str): The path to the file to check.

    Returns:
        bool: True if the file was empty and removed, False otherwise.
    """
    if os.path.exists(file_path):
        if os.path.getsize(file_path) == 0:
            # File size is 0 bytes
            try:
                os.remove(file_path)
                print(f"Removed empty file: {file_path}")
                return True
            except OSError as e:
                print(f"Error removing file: {e}")
                return False
        else:
            try:
                if os.path.getsize(file_path) > 10000000:
                    # Open only files less than 10 MB
                    return False
                with open(file_path, 'r') as f:
                    data = json.load(f)
            except Exception as Ex:
                print(f'----------------------------------- Exception: {Ex} -----------------------------------\n file_path: {file_path}\n ')
                with open(file_path, 'a') as f:
                    f.write(",{}]")
                with open(file_path, 'r') as f:
                    data = json.load(f)
            if not data:
                # File contains no data
                try:
                    os.remove(file_path)
                    print(f"Removed empty file: {file_path}")
                    return True
                except OSError as e:
                    print(f"Error removing file: {e}")
                    return False
            else:
                print(f"File is not empty or does not exist: {file_path}")
                return False
256
257# def get_resume_urls(domain_name_to_resume_from):258# '''259# -------------260# pseudocode:261# -------------262
263# domain_name_to_resume_from = ekantipur264
265# while there is file ['ekantipur.json', 'ekantipur_0.json', 'ekantipur_1.json', ...]:266# urls_to_visit.add(to_visited_urls)267# urls_visited.add(visited_urls)268
269# fro url in urls_to_visit:270# add url to news_urls if url is not present in urls_visited271
272# return list(news_urls)273# '''274
275# new_file_name = domain_name_to_resume_from + '.json'276# remove_file_if_empty(new_file_name)277
278# urls_to_visit = set()279# urls_visited = pybloom_live.ScalableBloomFilter(280# mode=pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH)281# news_start_ulrs = set()282# index = 0283 284# # load data from merged file285# if os.path.exists(domain_name_to_resume_from + '_merged_.json'): 286# if not remove_file_if_empty(new_file_name):287# # File not empty288# with open(domain_name_to_resume_from + '_merged_.json','r') as f:289# data = json.load(f)290# if data:291# for each_data in data:292# if type(each_data) == dict and 'to_visit' in each_data.keys():293# urls_to_visit.add(294# each_data['to_visit']) # each_data['to_visit'] is a of urls295# if type(each_data) == dict:296# if 'visited' in each_data.keys():297# urls_visited.add(298# each_data['visited']) # each_data['visited'] is a url299
300# while os.path.exists(new_file_name):301# if remove_file_if_empty(new_file_name):302# break303# try:304# with open(new_file_name, 'r') as file:305# data = json.load(file)306# except:307# '''308# file is corrupt when scrapy is terminated while it is still crawling.309# while corrupt, file is terminated with: `{some_data},`310# adding : `""]` at the end of file to make it valid json file311
312# '''313# with open(new_file_name, 'a') as file:314# file.write("\"\"]")315
316# with open(new_file_name, 'r') as file:317# data = json.load(file)318# for each_data in data:319# if type(each_data) == dict and 'to_visit' in each_data.keys():320# urls_to_visit.add(321# each_data['to_visit']) # each_data['to_visit'] is a of urls322
323# if type(each_data) == dict:324# if 'visited' in each_data.keys():325# urls_visited.add(326# each_data['visited']) # each_data['visited'] is a url327
328# new_file_name = domain_name_to_resume_from + f'_{index}.json'329# # remove file if file is empty330# remove_file_if_empty(new_file_name)331
332# index += 1333
334# print(f'\n\n new_file_name from function:{new_file_name}')335# for to_visit in urls_to_visit:336# if to_visit not in urls_visited:337# news_start_ulrs.add(to_visit)338
339# return list(news_start_ulrs), urls_visited340
341
def is_nepali_language(text):
    # Note: langid frequently labels Devanagari (Nepali) text as 'hi',
    # which is why 'hi' is treated as a match here.
    lang, confidence = langid.classify(text)
    return lang == 'hi', confidence
    # is_nepali("बल्ल बुझायो एनसेलले खरिद-बिक्री विवरण")


def is_google_drive_link(link):
    # Return (True, drive_id) for Google Drive links, otherwise (False, original_link).
    drive_prefixes = [
        'https://drive.google.com/file/d/',
        'https://drive.google.com/drive/folders/',
        'https://drive.google.com/open?id=',
        'https://drive.google.com/drive/u/0/folders/',
        'https://drive.google.com/drive/u/1/folders/',
        'https://drive.google.com/drive/u/2/folders/',
        'https://drive.google.com/drive/u/3/folders/',
        'https://drive.google.com/drive/u/0/file/d/',
        'https://drive.google.com/drive/u/1/file/d/',
        'https://drive.google.com/drive/u/2/file/d/',
        'https://drive.google.com/drive/u/3/file/d/',
        'https://drive.google.com/drive/mobile/folders/',
        'https://drive.google.com/folderview?id=',
    ]
    for prefix in drive_prefixes:
        if link.startswith(prefix):
            drive_id = link.split(prefix)[1].split('/')[0].split('?')[0]
            return True, drive_id

    # not a drive link: return the original link
    return False, link

def is_same_domain(url1, url2):
    return urlparse(url1).netloc == urlparse(url2).netloc


def is_np_domain(url):
    # return True if the url is under the .np top-level domain
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # crawl it if .np domain and len(visited_urls_base) < 1000

    return base_url.endswith('.np') or base_url.endswith('.np/')


def is_special_domain_to_crawl(url):
    # crawl some non '.np' domains
    return url.startswith("https://www.bbc.com/nepali/") or url.startswith("https://np.usembassy.gov/ne/")


def is_document_or_media(url):
    is_social_media, _ = is_social_media_link(url)
    is_document, _, _ = is_document_link(url)
    if is_social_media or is_document:
        return True
    return False


def should_we_crawl_it(new_url, visited_urls_base=None):
    # return (base_url, True) if it is a .np domain and fewer than 50 base domains have been visited
    parsed_url = urlparse(new_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # crawl it if .np domain and len(visited_urls_base) < 50

    if base_url[-3:] == '.np' and (visited_urls_base is None
                                   or len(visited_urls_base) < 50):
        return base_url, True
    else:
        return base_url, False


# import re
def is_social_media_link(link):
    social_media_patterns = {
        'facebook': r"facebook\.com",
        'instagram': r"instagram\.com",
        'twitter': r"twitter\.com",
        'x': r"x\.com",
        'tiktok': r"tiktok\.com",
        'linkedin': r"linkedin\.com",
        'pinterest': r"pinterest\.com",
        'youtube': r"youtube\.com|youtu\.be",
        'reddit': r"reddit\.com",
        'snapchat': r"snapchat\.com",
        'quora': r"quora\.com",
        'tumblr': r"tumblr\.com",
        'flickr': r"flickr\.com",
        'medium': r"medium\.com",
        'vimeo': r"vimeo\.com",
        'vine': r"vine\.co",
        'periscope': r"periscope\.tv",
        'meetup': r"meetup\.com",
        'mix': r"mix\.com",
        'soundcloud': r"soundcloud\.com",
        'behance': r"behance\.net",
        'dribbble': r"dribbble\.com",
        'vk': r"vk\.com",
        'weibo': r"weibo\.com",
        'ok': r"ok\.ru",
        'deviantart': r"deviantart\.com",
        'slack': r"slack\.com",
        'telegram': r"t\.me|telegram\.me",
        'whatsapp': r"whatsapp\.com",
        'line': r"line\.me",
        'wechat': r"wechat\.com",
        'kik': r"kik\.me",
        'discord': r"discord\.gg",
        'skype': r"skype\.com",
        'twitch': r"twitch\.tv",
        'myspace': r"myspace\.com",
        'badoo': r"badoo\.com",
        'tagged': r"tagged\.com",
        'meetme': r"meetme\.com",
        'xing': r"xing\.com",
        'renren': r"renren\.com",
        'skyrock': r"skyrock\.com",
        'livejournal': r"livejournal\.com",
        'fotolog': r"fotolog\.com",
        'foursquare': r"foursquare\.com",
        'cyworld': r"cyworld\.com",
        'gaiaonline': r"gaiaonline\.com",
        'blackplanet': r"blackplanet\.com",
        'care2': r"care2\.com",
        'cafemom': r"cafemom\.com",
        'nextdoor': r"nextdoor\.com",
        'kiwibox': r"kiwibox\.com",
        'cellufun': r"cellufun\.com",
        'tinder': r"tinder\.com",
        'bumble': r"bumble\.com",
        'hinge': r"hinge\.co",
        'match': r"match\.com",
        'okcupid': r"okcupid\.com",
        'zoosk': r"zoosk\.com",
        'plentyoffish': r"pof\.com",
        'eharmony': r"eharmony\.com",
        'coffee_meets_bagel': r"coffeemeetsbagel\.com",
        'her': r"weareher\.com",
        'grindr': r"grindr\.com",
        'happn': r"happn\.com",
        'hily': r"hily\.com",
        'huggle': r"huggle\.com",
        'jdate': r"jdate\.com",
        'lovoo': r"lovoo\.com",
        'meetmindful': r"meetmindful\.com",
        'once': r"once\.com",
        'raya': r"raya\.app",
        'ship': r"getshipped\.com",
        'silversingles': r"silversingles\.com",
        'tastebuds': r"tastebuds\.fm",
        'the_league': r"theleague\.com",
        'tudder': r"tudder\.com",
        'twoo': r"twoo\.com",
    }

    for social_media, pattern in social_media_patterns.items():
        if (re.search(r'https://www\.' + pattern, link, flags=re.IGNORECASE)) or (
                (re.search(r'https://www\.m\.' + pattern, link, flags=re.IGNORECASE))):
            return True, social_media

    return False, None

# is_social_media_link('https://www.facebook.com/sth.s/ste')  # (True, 'facebook')
# is_social_media_link('https://www.m.youtube.com/')  # (True, 'youtube')
# is_social_media_link('https://www.google.com/search?q=youtube.com')  # (False, None)


def is_document_link(link):
    # document extensions
    document_extensions = [
        '.pdf', '.odt', '.docx', '.doc', '.pst', '.ai', '.drw', '.dxf', '.eps',
        '.ps', '.svg', '.cdr', '.odg'
    ]
    # presentation extensions
    presentation_extensions = ['.ppt', '.pptx', '.odp', '.pps']
    # spreadsheet extensions
    spreadsheet_extensions = ['.xls', '.xlsx', '.ods']
    # archive extensions
    archive_extensions = [
        '.zip', '.rar', '.tar', '.gz', '.7z', '.7zip', '.bz2', '.tar.gz', '.xz'
    ]
    # video extensions
    video_extensions = [
        '.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.mng', '.mpg', '.qt',
        '.rm', '.swf', '.m4v', '.webm'
    ]
    # audio extensions
    audio_extensions = [
        '.mp3', '.wav', '.ogg', '.wma', '.ra', '.aac', '.mid', '.au', '.aiff',
        '.3gp', '.asf', '.asx', '.m4a'
    ]
    # image extensions
    image_extensions = [
        '.bmp', '.gif', '.jpg', '.jpeg', '.png', '.tif', '.tiff', '.ico',
        '.webp', '.mng', '.pct', '.psp'
    ]
    # other extensions
    other_extensions = ['.css', '.exe', '.bin', '.rss', '.dmg', '.iso', '.apk']

    extensions = {
        'document': document_extensions,
        'presentation': presentation_extensions,
        'spreadsheet': spreadsheet_extensions,
        'archive': archive_extensions,
        'video': video_extensions,
        'audio': audio_extensions,
        'image': image_extensions,
        'other': other_extensions
    }

    for doc_type, extension_list in extensions.items():
        for extension in extension_list:
            if link.lower().endswith(extension):
                return True, doc_type, (
                    doc_type == 'other'
                )  # doc_type == 'image' to ignore image or other docs
    return False, None, True


# is_document_link('https://acem.edu.np/uploads/userfiles/files/old_question/8th%20Semester/BCTBEXBEL/Wind%20Energy%202.PDF')
# is_document, document_type = is_document_link('https://www.google.com/link.pdf')
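A few hedged examples of how the helpers above behave, assuming the module is importable as `src.spiders.functions`; the URLs are made up:

from src.spiders.functions import (
    remove_fragments_from_url, is_np_domain, is_social_media_link, is_document_link)

# Query strings and fragments are stripped, leaving scheme + host + path.
print(remove_fragments_from_url("https://example.gov.np/news/123?ref=home#title"))
# -> https://example.gov.np/news/123

# Only the netloc is inspected for the .np check.
print(is_np_domain("https://example.gov.np/news/123"))      # True
print(is_np_domain("https://www.bbc.com/nepali/articles"))  # False

# Social media and document links are classified so the spider can skip or log them.
print(is_social_media_link("https://www.facebook.com/somepage"))    # (True, 'facebook')
print(is_document_link("https://example.gov.np/files/report.pdf"))  # (True, 'document', False)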
src/spiders/mongo.py
import csv
import os
import time
from pymongo.server_api import ServerApi

from dotenv import load_dotenv
load_dotenv()

# Creating mongodb client
from pymongo import MongoClient


class Mongo():
    def __init__(self, db_name='scrapy-engine', collection_name="urls-collection", local=False):
        if local:
            client = MongoClient('localhost', 27017)
            self.db = client[db_name]
            self.collection = client[db_name][collection_name]

            # Ensure index is created for faster search operations
            self.collection.create_index('url', unique=True)
        else:
            uri = f"mongodb+srv://jokeleopedia:{os.environ.get('mongo_password')}@scrapy-engine.5cqch4y.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"
            # uri = f"mongodb+srv://{os.environ.get('mongo_username')}:{os.environ.get('mongo_password')}@scrapy-engine.xwugtdk.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"
            # uri = f"mongodb+srv://{os.environ.get('user_heroku')}:{os.environ.get('pass_heroku')}@cluster0.dgeujbs.mongodb.net/?retryWrites=true&w=majority"

            # Create a connection using MongoClient. You can import MongoClient or use pymongo.MongoClient
            client = MongoClient(uri, server_api=ServerApi('1'))

            # Create the database for our example (we will use the same database throughout the tutorial)
            self.db = client[db_name]
            self.collection = self.db[collection_name]  # using single collection for all urls
            # # one time operation
            # self.collection.create_index('url', unique=True)

    def append_error_data(self, data):
        '''
        # e.g. first add new data and update it

        import time
        from mongo import Mongo
        mongo = Mongo()

        error_data = {'url':'https://google.com', 'timestamp':time.time(), 'status':'error', 'status_code':400, 'error_type':'HttpError'}
        error_data2 = {'url':'https://google.com', 'timestamp':'now', 'status':'not-error', 'status_code':'errororrororo', 'error_type':'HttpError'}

        # insert error_data
        mongo.db['test'].update_one(
            {'url': error_data['url']},
            {'$set': error_data},  # data is dict with error info
            upsert=True
        )
        list(mongo.db['test'].find({'url':'https://google.com'}))  # output: [{'_id': ObjectId('66a9c58c0bee3830ff4dcba3'), 'url': 'https://google.com', 'error_type': 'HttpError', 'status': 'error', 'status_code': 400, 'timestamp': 1722402186.1221087}]

        # update with error_data2
        mongo.db['test'].update_one(
            {'url': error_data2['url']},
            {'$set': error_data2},  # data is dict with error info
            upsert=True
        )
        list(mongo.db['test'].find({'url':'https://google.com'}))  # output: [{'_id': ObjectId('66a9c58c0bee3830ff4dcba3'), 'url': 'https://google.com', 'error_type': 'HttpError', 'status': 'not-error', 'status_code': 'errororrororo', 'timestamp': 'now'}]
        '''

        # upsert the error data
        self.collection.update_one(
            {'url': data['url']},
            {'$set': data},  # data is dict with error info
            upsert=True
        )

        # Delete url if its status is either 'to_crawl' or 'crawling'
        # if status is crawled, try/except should avoid creating error
        # results = list(self.collection.find({'url':data['url'], 'status': {'$in': ['to_crawl', 'crawling']}}))
        # if results:
        #     self.collection.delete_one({'_id':results[0]['_id']})

        # try:
        #     # Try inserting url
        #     self.collection.insert_one(data)
        #     return True  # inserted
        # except Exception as ex:
        #     print(ex)
        #     return  # error data exists

    def append_url_crawled(self, url):
        try:
            # Try inserting url
            self.collection.insert_one({'url': url, 'timestamp': time.time(), 'status': 'crawled'})
        except Exception as ex:
            # url exists: change status to 'crawled'
            self.collection.update_one({'url': url}, {'$set': {'status': 'crawled'}})

    def append_url_crawling(self, url):
        try:
            # Try inserting url
            self.collection.insert_one({'url': url, 'timestamp': time.time(), 'status': 'crawling'})
            return True  # inserted
        except Exception as ex:
            # modify to_crawl to crawling
            result = self.collection.update_one(
                {'url': url, 'status': {'$in': ['to_crawl']}},
                {'$set': {'status': 'crawling'}},
            )
            if result.modified_count > 0:
                # print("url was in to_crawl and now changed to crawling")
                return True
            else:
                # print("it is either already crawling or crawled")
                return False

    def append_url_to_crawl(self, url):
        if type(url) == list:
            # Insert multiple urls
            '''
            ordered=False: If an error occurs during the processing of one of the write operations, MongoDB will continue to process remaining write operations in the queue.
            '''
            try:
                self.collection.insert_many([{'url': u, 'timestamp': time.time(), 'status': 'to_crawl?'} for u in url], ordered=False)
            except Exception as ex:
                pass
                # print(ex)
        elif type(url) == str:
            try:
                # Try inserting url
                self.collection.insert_one({'url': url, 'timestamp': time.time(), 'status': 'to_crawl?'})
                return True  # inserted
            except Exception as ex:
                pass
                # print(ex)  # url exists

    def check_connection(self):
        # Create a new client and connect to the server
        uri = f"mongodb+srv://jokeleopedia:{os.environ.get('mongo_password')}@scrapy-engine.5cqch4y.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"
        client = MongoClient(uri, server_api=ServerApi('1'))
        # ping to confirm a successful connection
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

    def delete_to_crawl(self, url):
        try:
            self.collection.delete_one({'url': url, 'status': 'to_crawl'})
        except Exception as ex:
            print(ex)

    def export_local_mongo(self):
        '''
        # Export the data from local mongo to a csv file 'local_mongo.csv'
        with headers: 'url', 'timestamp', 'status'

        # Comments
        * 10K at a time was very slow. 100K at a time is still slow though..
        '''
        def save_to_csv(data, csv_file_path='local_mongo.csv', field_names=['url', 'timestamp', 'status']):
            file_exists = os.path.exists(csv_file_path)
            with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
                # Create a CSV writer object
                csv_writer = csv.DictWriter(csvfile, fieldnames=field_names)

                # If the file doesn't exist, write the header
                if not file_exists:
                    print('writing header')
                    csv_writer.writeheader()

                # Write the data to the CSV file
                try:
                    # Save data to csv file
                    csv_writer.writerows(data)
                except Exception as ex:
                    print(ex)
        # save_to_csv(entries)

        entries_count = self.collection.count_documents({})
        print(f'Exporting {entries_count} entries')
        # get all entries 100000 at a time
        for i in range(0, entries_count, 100000):
            entries = list(self.collection.find({}).skip(i).limit(100000))
            entries = [{'url': entry['url'], 'timestamp': entry['timestamp'], 'status': entry['status']} for entry in entries]
            print(f'Exporting {i} to {i+100000} entries. step: {int(i/100000)}/{int(entries_count/100000)}')
            save_to_csv(entries)

    def import_to_local_mongo(self):
        '''
        # Load the data from local_mongo.csv to local mongo
        * 10K at a time
        '''
        def load_from_csv(csv_file_path='local_mongo.csv', field_names=['url', 'timestamp', 'status']):
            def save_to_mongo(self, entries):
                # Insert multiple data
                '''
                ordered=False: If an error occurs during the processing of one of the write operations, MongoDB will continue to process remaining write operations in the queue.
                '''
                try:
                    # self.collection.insert_many([{'url':data['url'], 'timestamp':data['timestamp'], 'status':data['status']} for data in datas], ordered=False)
                    self.collection.insert_many(entries, ordered=False)
                except Exception as ex:
                    pass
                    # print(ex)
            data = []
            import_count = 0
            with open(csv_file_path, 'r', newline='', encoding='utf-8') as csvfile:
                # Create a CSV reader object
                csv_reader = csv.DictReader(csvfile)
                for row in csv_reader:
                    '''
                    format of data is:
                    [
                        {
                            'url': 'https://psc.gov.np/others/4/d46d4ca7616bcde106e08f4b8636a247e84450197b6062248f2cb8cd2f550277b55612bd7320cd3f2d7aec2ada0707ba992e0a2bfb66a7154cf921a2ae37028964.NDLeVePim0d~oLhNjuuA4I~L7sdJOUPqysLgcC6c-.html',
                            'timestamp': '1716021918.59285',
                            'status': 'crawled'
                        },
                        ...
                    ]
                    '''
                    data.append(row)
                    if len(data) > 10000:
                        # Save to mongo
                        save_to_mongo(self, data)
                        data = []  # re-initialize data
                        import_count += 10000
                        print(f'Import count: {import_count} iteration: {int(import_count/10000)}')
            # return data
        load_from_csv()

    def recover_expired_crawling(self, created_before=7200):
        # def convert_from_crawling_to_to_crawl(urls):
        #     # for url in urls:
        #     #     self.collection.update_one(
        #     #         {'_id':url['_id'], 'status': {'$in': ['crawling']}},
        #     #         {'$set': {'status':'to_crawl'}}
        #     #     )
        #     # perform bulk update
        #     if urls:
        #         self.collection.update_many(
        #             {'_id': {'$in': [url['_id'] for url in urls]}},
        #             {'$set': {'status':'to_crawl'}}
        #         )

        # get items with status 'crawling' whose timestamp is older than 2 hours (7200 s)
        timestamp = time.time() - created_before  # 2 hours ago

        pipeline = [
            {"$match": {"status": "crawling", "timestamp": {"$lt": str(timestamp)}}},
            # {"$count": "count"}
        ]
        # The result is a list of documents returned by the aggregation pipeline
        expired_crawling_urls = list(self.collection.aggregate(pipeline))
        return expired_crawling_urls
        # convert_from_crawling_to_to_crawl(expired_crawling_urls)

    def fetch_start_urls(self, number_of_urls_required=10):
        # return [json.loads(url) for url in self.redis_client.srandmember('urls_to_crawl_cleaned_set', number_of_new_urls_required)]

        # Get random entries with status 'to_crawl'
        random_urls = list(self.collection.aggregate([
            {"$match": {"status": "to_crawl"}},
            {"$sample": {"size": number_of_urls_required}}
        ]))
        # urls = list(self.collection.find({'status':'to_crawl'}).limit(number_of_urls_required))

        # perform bulk update
        if random_urls:
            self.collection.update_many(
                {'_id': {'$in': [url['_id'] for url in random_urls]}},
                {'$set': {'status': 'crawling'}}
            )

        return random_urls

    def fetch_all(self):
        return list(self.collection.find())

    def set_configs(self, configs):
        '''
        # Tests:
        * variable types seem to be preserved
        * if key exists, value is updated; otherwise a new key-value pair is created
        # Format of configs
        ```
        configs = [
            {'crawl_other_data': False},
            {'crawl_paragraph_data': True},
            {'some_config': 1000}
        ]
        ```
        '''
        for config in configs:
            self.db['config'].update_one(
                {'name': list(config.keys())[0]},
                {'$set': {'value': list(config.values())[0]}},
                upsert=True
            )

    def get_configs(self):
        '''
        # Returns data of format:
        ```
        {
            'crawl_other_data': False,
            'crawl_paragraph_data': True
        }
        ```
        '''
        configs_data = self.db['config'].find({})
        configs = {}
        for config in configs_data:
            configs[config['name']] = config['value']
        return configs


if __name__ == "__main__":
    # delete all data and create unique index for field: 'url'
    mongo = Mongo()

    collection_names = ['url_crawled', 'url_to_crawl', 'url_crawling']
    for collection_name in collection_names:
        mongo.db[collection_name].delete_many({})
        mongo.db[collection_name].create_index('url', unique=True)

    # # local mongo
    # # from mongo import Mongo
    # local_mongo = Mongo(local=True)

    # # Get all entries with limit 10
    # print(local_mongo.collection.find({}).limit(10))

    # entries_count = local_mongo.collection.count_documents({})
    # # get all entries 10000 at a time
    # for i in range(0, entries_count, 10000):
    #     entries = list(local_mongo.collection.find({}).skip(i).limit(10000))
    #     append_to_csv(entries)
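A rough usage sketch for the Mongo helper above, assuming a MongoDB instance is reachable on localhost:27017 (the hosted-cluster branch instead needs the mongo_password environment variable); the URLs are made up:

from src.spiders.mongo import Mongo

mongo = Mongo(local=True)  # connects to mongodb://localhost:27017

# Store crawler configuration flags that the spider reads at startup.
mongo.set_configs([{'crawl_other_data': False}, {'crawl_paragraph_data': True}])
print(mongo.get_configs())  # {'crawl_other_data': False, 'crawl_paragraph_data': True}

# Track a URL through its lifecycle.
mongo.append_url_crawling('https://example.gov.np/news/123')
mongo.append_url_crawled('https://example.gov.np/news/123')

# Queue more URLs. Note that append_url_to_crawl() stores status 'to_crawl?' while
# fetch_start_urls() samples documents with status 'to_crawl', so a separate step
# presumably promotes queued URLs before they are handed out.
mongo.append_url_to_crawl(['https://example.gov.np/a', 'https://example.gov.np/b'])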
src/spiders/quotes_spider.py
import random

import scrapy


# scrapy crawl quotes_spider -o quotes.json
class QuotesSpider(scrapy.Spider):
    name = "quotes_spider"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/",
        # "https://quotes.toscrape.com/page/3/",
        # "https://quotes.toscrape.com/page/4/",
        # "https://quotes.toscrape.com/page/5/",
        # "https://quotes.toscrape.com/page/6/",
        # "https://quotes.toscrape.com/page/7/",
        # "https://quotes.toscrape.com/page/8/",
        # "https://quotes.toscrape.com/page/9/",
        # "https://quotes.toscrape.com/page/10/",
    ]

    def parse(self, response):
        import os; print('\n\n\n\n', os.listdir(), os.environ.get('some_key2'), '\n\n')
        # print(f"\n before {self.crawler.engine.slot.scheduler.stats._stats['scheduler/enqueued']} \n\n")
        print(f"\n before2 {len(self.crawler.engine.slot.inprogress)}")  # don't know why it always returns zero
        print(self.crawler.engine.settings.get('CONCURRENT_REQUESTS'))
        print(len(self.crawler.engine.slot.inprogress) >= self.crawler.engine.settings.get('CONCURRENT_REQUESTS'))
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
                "tags": quote.css("div.tags a.tag::text").getall(),
            }
        # next_page = response.css("li.next a::attr(href)").get()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
        # print(f"\n After {self.crawler.engine.slot.scheduler.stats._stats['scheduler/enqueued']} \n\n")
        # print(f"\n after2 {len(self.crawler.engine.slot.scheduler)}")  # don't know why it always returns zero

    def spider_closed(self, spider, reason):
        print(f"Spider is almost finished. reason: {reason}")

        # Restart spider with more start urls
        new_urls = get_new_start_urls()
        for url in new_urls:
            yield scrapy.Request(url, callback=self.parse)


all_start_urls = [
    # "https://quotes.toscrape.com/page/1/",
    # "https://quotes.toscrape.com/page/2/",
    "https://quotes.toscrape.com/page/3/",
    "https://quotes.toscrape.com/page/4/",
    "https://quotes.toscrape.com/page/5/",
    "https://quotes.toscrape.com/page/6/",
    "https://quotes.toscrape.com/page/7/",
    "https://quotes.toscrape.com/page/8/",
    "https://quotes.toscrape.com/page/9/",
    "https://quotes.toscrape.com/page/10/"
]


def get_new_start_urls():
    new_start_urls = random.sample(all_start_urls, 2)
    for url in new_start_urls:
        all_start_urls.remove(url)
    return new_start_urls
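The comment at the top of the file shows the CLI invocation (scrapy crawl quotes_spider -o quotes.json); an equivalent programmatic sketch, assuming the project is importable as `src`, would be:

from scrapy.crawler import CrawlerProcess
from src.spiders.quotes_spider import QuotesSpider

# Write scraped quotes to quotes.json via Scrapy's feed exports.
process = CrawlerProcess(settings={
    "FEEDS": {"quotes.json": {"format": "json", "overwrite": True}},
})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes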
src/spiders/title.py
from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.responsetypes import Response

from ..items import TitleItem


class TitleSpider(Spider):
    """
    Scrapes title pages and enqueues all links found on the page.
    """

    name = 'title_spider'

    # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
    # when the project is executed using Apify.
    start_urls = ['https://apify.com/']

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        """
        Parse the web page response.

        Args:
            response: The web page response.

        Yields:
            Yields scraped TitleItem and Requests for links.
        """
        self.logger.info('TitleSpider is parsing %s...', response)

        # Extract and yield the TitleItem
        url = response.url
        title = response.css('title::text').extract_first()
        yield TitleItem(url=url, title=title)

        # Extract all links from the page, create Requests out of them, and yield them
        for link_href in response.css('a::attr("href")'):
            link_url = urljoin(response.url, link_href.get())
            if link_url.startswith(('http://', 'https://')):
                yield Request(link_url)
src/spiders/worker_spider_v2.py
from .functions import is_social_media_link, is_document_link, is_google_drive_link, is_same_domain, is_np_domain, is_special_domain_to_crawl, load_env_var_in_google_colab, remove_fragments_from_url, is_nepali_language, is_valid_text_naive, is_document_or_media
# import pybloom_live
import scrapy
import dotenv
# import json
# import os
# import redis
# import threading
import time

# from scrapy import signals  # , Spider
from scrapy.linkextractors import LinkExtractor
from .mongo import Mongo

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError


class WorkerSpider(scrapy.Spider):
    name = "worker_spider"
    crawled_data = []
    to_visit = []
    other_data = []

    def __init__(self, *args, **kwargs):
        # load environment variables if running in google colab
        load_env_var_in_google_colab()
        # dotenv.load_dotenv("server/.env")

        # super(EkantipurSpider, self).__init__(*args, **kwargs)
        self.mongo = Mongo()

        # ----------------------------------------------------------------------------
        # config variable from mongo db to decide whether or not to crawl other_data
        print('------------------------------------')

        # Get configs
        configs = self.mongo.get_configs()

        # default: Do not crawl other data
        self.crawl_other_data = configs['crawl_other_data'] if 'crawl_other_data' in configs else False
        print(f'config.crawl_other_data: {self.crawl_other_data}')

        # default: Crawl paragraph data
        self.crawl_paragraph_data = configs['crawl_paragraph_data'] if 'crawl_paragraph_data' in configs else True
        print(f'config.crawl_paragraph_data: {self.crawl_paragraph_data}')
        print('------------------------------------')
        # ----------------------------------------------------------------------------

        # self.redis_client = redis.Redis(
        #     host=os.environ.get('REDIS_HOST', 'localhost'),
        #     port=int(os.environ.get('REDIS_PORT', 6379)),
        #     password=os.environ.get('REDIS_PASSWORD', None),
        # )

        # self.start_urls = self.fetch_start_urls()
        # self.start_urls = [data_item['url'] for data_item in self.mongo.fetch_start_urls()]
        # print(f'\n\n start_urls: {self.start_urls} \n\n')
        # for url in self.start_urls:
        #     yield scrapy.Request(url, callback=self.parse, errback=self.errback_httpbin)  # , dont_filter=True : allows visiting same url again

        # using bloom filter to store visited urls for faster lookup
        # self.visited_urls = pybloom_live.ScalableBloomFilter(mode=pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH)

    # def fetch_start_urls(self, number_of_new_urls_required=10):
    #     return [json.loads(url) for url in self.redis_client.srandmember('urls_to_crawl_cleaned_set', number_of_new_urls_required)]

    def start_requests(self):
        n_concurrent_requests = self.crawler.engine.settings.get('CONCURRENT_REQUESTS')
        start_urls = [remove_fragments_from_url(data_item['url']) for data_item in self.mongo.fetch_start_urls(n_concurrent_requests)]
        print(f'\n\n start:{start_urls} \n\n')
        for url in start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.errback_httpbin)  # , dont_filter=True : allows visiting same url again

    def parse(self, response):
        if 'text/html' not in response.headers.get('Content-Type', b'').decode('utf-8'):
            # response is not text/html (it's probably some document)
            # save the link to mongo
            self.mongo.db['other_data'].insert_one(
                {
                    'url': response.url,
                    'Content-Type': response.headers.get('Content-Type', b'').decode('utf-8'),
                }
            )
            return
        links = LinkExtractor(deny_extensions=[]).extract_links(response)
        drive_links = []  # Drive links
        site_links = []  # website links

        # yield every heading, paragraph from current page
        headings_and_paragraphs = response.css('h1::text, h2::text, h3::text, h4::text, h5::text, h6::text, p::text').getall()
        if self.crawl_paragraph_data:
            # Crawl paragraph data only if config is set to True
            the_crawled_data = []  # list for bulk upload
            for paragraph in headings_and_paragraphs:
                is_nepali, confidence = is_nepali_language(paragraph)
                if is_nepali and is_valid_text_naive(paragraph):  # implement a way to detect nepali language (langid is not reliable)
                    # Collect crawled data for a bulk insert
                    the_crawled_data.append({
                        'parent_url': response.url,
                        'page_title': response.css('title::text').get(),
                        'paragraph': paragraph,
                        # 'is_nepali_confidence': confidence
                    })
            '''
            # Old Code using Redis: Redis turned out to be too slow for concurrent processes
            self.redis_client.lpush('crawled_data', json.dumps(the_crawled_data))
            '''
            if the_crawled_data:
                self.mongo.db['crawled_data'].insert_many(the_crawled_data)
        else:
            pass
            # print(f'Avoided crawling paragraph_data: {not self.crawl_paragraph_data}')

        # Saving drive links
        if self.crawl_other_data:
            # config['crawl_other_data'] is True, so crawl other data
            the_other_data = []  # list for bulk upload
            for link in links:
                is_drive_link, _ = is_google_drive_link(link.url)  # DriveFunctions.is_google_drive_link(link.url)
                if is_drive_link:
                    the_other_data.append({
                        'parent_url': response.url,
                        'url': link.url,
                        'text': link.text,
                        'link_type': "drive_link",
                        'link_description': None
                    })
                    # self.mongo.db['other_data'].insert_one(the_other_data)
                    '''
                    # Old Code using Redis: Redis turned out to be too slow for concurrent processes
                    self.redis_client.lpush('other_data', json.dumps(the_other_data))
                    '''
                else:
                    is_document, document_type, ignore_doc = is_document_link(link.url)
                    if is_document:
                        if not ignore_doc:  # and doc_type != 'image'
                            the_other_data.append({
                                'parent_url': response.url,
                                'url': link.url,
                                'text': link.text,
                                'link_type': "document",
                                'link_description': document_type
                            })
                            # self.mongo.db['other_data'].insert_one(the_other_data)
                            '''
                            # Old Code using Redis: Redis turned out to be too slow for concurrent processes
                            self.redis_client.lpush('other_data', json.dumps(the_other_data))
                            '''
                    else:
                        is_social_media, social_media_type = is_social_media_link(link.url)
                        if is_social_media:
                            the_other_data.append({
                                'parent_url': response.url,
                                'url': link.url,
                                'text': link.text,
                                'link_type': "social_media",
                                'link_description': social_media_type
                            })
                            # self.mongo.db['other_data'].insert_one(the_other_data)
                            '''
                            # Old Code using Redis: Redis turned out to be too slow for concurrent processes
                            self.redis_client.lpush('other_data', json.dumps(the_other_data))
                            '''
                        else:
                            site_links.append(link)
            if the_other_data:
                self.mongo.db['other_data'].insert_many(the_other_data)
        else:
            # config['crawl_other_data'] is False, so do not crawl other data
            for link in links:
                # Avoid crawling documents or social media links
                if not is_document_or_media(link.url):
                    site_links.append(link)

        # Next pages to follow:
        the_to_crawl_urls = []  # list for bulk upload
        for site_link in site_links:
            # print(f' \n site link: {site_link.url} \n')
            # base_url, crawl_it = should_we_crawl_it(site_link.url)  # self.visited_urls_base,
            de_fragmented_url = remove_fragments_from_url(site_link.url)
            if (is_np_domain(de_fragmented_url) or is_special_domain_to_crawl(de_fragmented_url)):  # and de_fragmented_url not in self.visited_urls:
                # is_same_domain(response.url, de_fragmented_url) is a problem because attempting to crawl bbc.com/nepali can lead to bbc.com
                # only follow urls from same domain or other nepali domain and not visited yet
                # if len(self.crawler.engine.slot.inprogress) >= self.crawler.engine.settings.get('CONCURRENT_REQUESTS'):
                #     # send new urls_to_crawl to redis.
                #     # self.redis_client.sadd('url_to_crawl', json.dumps(de_fragmented_url))
                #     self.mongo.append_url_to_crawl(de_fragmented_url)
                if len(de_fragmented_url.strip()) > 0 and len(de_fragmented_url) < 1000:
                    # avoid urls with length greater than 1000
                    the_to_crawl_urls.append(de_fragmented_url)

                # else:
                #     if self.mongo.append_url_crawling(de_fragmented_url):
                #         # yield url to crawl new urls_to_crawl by itself
                #         yield scrapy.Request(de_fragmented_url, callback=self.parse, errback=self.errback_httpbin)  # dont_filter=True : allows visiting same url again
                #         # self.to_visit.append(de_fragmented_url)
                #         # Send crawling notice to server
                #         # self.redis_client.sadd('url_crawling', json.dumps(de_fragmented_url))
        if the_to_crawl_urls:
            self.mongo.append_url_to_crawl(the_to_crawl_urls)
        # append crawled url to visited urls
        # self.mongo.delete_to_crawl(response.request.url)
        ''' Note:
        If a url redirects to another url, then the original url is added to visited urls so as not to visit it again.
        The redirected url is sent via crawled_data and other_data, then visited_url is updated.
        '''

        # Keep the worker busy by getting new urls to crawl
        # Get a few more start urls to crawl next
        number_of_new_urls_required = self.crawler.engine.settings.get('CONCURRENT_REQUESTS') - len(self.crawler.engine.slot.inprogress)
        if number_of_new_urls_required > 0:
            n_concurrent_requests = self.crawler.engine.settings.get('CONCURRENT_REQUESTS')
            fetched_start_urls = [remove_fragments_from_url(data_item['url']) for data_item in self.mongo.fetch_start_urls(n_concurrent_requests)]
            # fetched_start_urls = [remove_fragments_from_url(data_item['url']) for data_item in self.mongo.fetch_start_urls(number_of_new_urls_required)]
            if fetched_start_urls:
                for url in fetched_start_urls:
                    # if url not in self.visited_urls:  # not necessary as we are fetching from mongo
                    #     self.visited_urls.add(url)
                    yield scrapy.Request(url, callback=self.parse)

    def errback_httpbin(self, failure):
        # log all failures
        self.logger.error(repr(failure))

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

            error_data = {'url': response.url, 'timestamp': time.time(), 'status': 'error', 'status_code': response.status, 'error_type': 'HttpError'}
            self.mongo.append_error_data(error_data)
            # print(f'\n\n\n\n{error_data}\n\n\n\n\n\n')

            if response.status in [400, 401, 403, 404, 405, 408, 429, 500, 502, 503, 504]:
                self.logger.error('Got %s response. URL: %s', response.status, response.url)
                # handle the error here

        elif failure.check(DNSLookupError):
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

            # Save error data on mongodb (no HTTP response is available here)
            error_data = {'url': request.url, 'timestamp': time.time(), 'status': 'error', 'status_code': None, 'error_type': 'DNSLookupError'}
            self.mongo.append_error_data(error_data)
            # print(f'\n\n\n\n{error_data}\n\n\n\n\n\n')

        elif failure.check(TCPTimedOutError, TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)

            # Save error data on mongodb (no HTTP response is available here)
            error_data = {'url': request.url, 'timestamp': time.time(), 'status': 'error', 'status_code': None, 'error_type': 'TimeoutError'}
            self.mongo.append_error_data(error_data)
            # print(f'\n\n\n\n{error_data}\n\n\n\n\n\n')
        else:
            self.logger.error('Unknown error on %s', failure.request.url)

            # Save error data on mongodb
            error_data = {'url': failure.request.url, 'timestamp': time.time(), 'status': 'error', 'status_code': None, 'error_type': 'Unknown'}
            # print(f'\n\n Unknown error : {error_data}')
            self.mongo.append_error_data(error_data)
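A minimal local-run sketch for WorkerSpider, under the assumptions that the Scrapy settings module is `src.settings` (as `src/__main__.py` sets), that the MongoDB credentials (for example `mongo_password`) are available in the environment, and that the urls-collection already holds documents with status 'to_crawl' for fetch_start_urls() to hand out:

import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from src.spiders.worker_spider_v2 import WorkerSpider

# The spider reads its URL queue and config flags from MongoDB, so the
# connection built in Mongo.__init__ must be reachable before starting.
os.environ.setdefault("SCRAPY_SETTINGS_MODULE", "src.settings")

process = CrawlerProcess(get_project_settings())
process.crawl(WorkerSpider)
process.start()  # blocks until the crawl finishes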
src/__main__.py
1"""2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using7`scrapy crawl title_spider`.8
9We recommend you do not modify this file unless you really know what you are doing.10"""11
12# We need to configure the logging first before we import anything else, so that nothing else imports13# `scrapy.utils.log` before we patch it.14from __future__ import annotations15from logging import StreamHandler, getLogger16from typing import Any17from scrapy.utils import log as scrapy_logging18from scrapy.utils.project import get_project_settings19from apify.log import ActorLogFormatter20
21# Define names of the loggers.22MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']23OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']24ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for28# a specific logger, do it in this file.29settings = get_project_settings()30LOGGING_LEVEL = settings['LOG_LEVEL']31
32# Define a logging handler which will be used for the loggers.33apify_handler = StreamHandler()34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:38 """39 Configure a logger with the specified settings.40
41 Args:42 logger_name: The name of the logger to be configured.43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).44 handlers: Optional list of logging handlers.45 """46 logger = getLogger(logger_name)47 logger.setLevel(log_level)48 logger.handlers = []49
50 for handler in handlers:51 logger.addHandler(handler)52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from55# the `main.py` and Scrapy components.56for logger_name in MAIN_LOGGER_NAMES:57 configure_logger(logger_name, LOGGING_LEVEL, apify_handler)58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because63# otherwise we would lose some log messages.64old_configure_logging = scrapy_logging.configure_logging65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:68 """69 We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root70 logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary71 loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here72 these four loggers and the root logger.73 """74 old_configure_logging(*args, **kwargs)75
76 # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`77 # property within spiders. See details in the Spider logger property:78 # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.79 configure_logger(None, LOGGING_LEVEL, apify_handler)80
81 # We modify other loggers only by setting up their log level. A custom log handler is added82 # only to the root logger to avoid duplicate log messages.83 for logger_name in ALL_LOGGER_NAMES:84 configure_logger(logger_name, LOGGING_LEVEL)85
86 # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless87 # messages, especially when running on the platform.88 configure_logger('httpx', 'WARNING')89
90
91scrapy_logging.configure_logging = new_configure_logging92
93# Now we can do the rest of the setup94import asyncio95import os96import nest_asyncio97from scrapy.utils.reactor import install_reactor98from .main import main99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor102# The reactor installation must be done manually before calling `nest_asyncio.apply()`,103# otherwise, it will not work correctly on Windows.104install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')105nest_asyncio.apply()106
107# Specify the path to the Scrapy project settings module108os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'109
110# Run the Apify main coroutine111asyncio.run(main())
src/items.py
1"""2Scrapy item models module3
4This module defines Scrapy item models for scraped data. Items represent structured data5extracted by spiders.6
7For detailed information on creating and utilizing items, refer to the official documentation:8https://docs.scrapy.org/en/latest/topics/items.html9"""10
11from scrapy import Field, Item12
13
14class TitleItem(Item):15 """16 Represents a title item scraped from a web page.17 """18
19 url = Field()20 title = Field()
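For illustration only, TitleItem behaves like a dict-backed Scrapy Item; a small hedged sketch using itemadapter (already used by middlewares.py):

from itemadapter import ItemAdapter

from src.items import TitleItem

item = TitleItem(url='https://apify.com/', title='Apify')
print(item['title'])               # 'Apify'
print(ItemAdapter(item).asdict())  # {'url': 'https://apify.com/', 'title': 'Apify'}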
src/main.py
"""
This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
processes the Actor's input and executes the Scrapy spider. Additionally, it updates the Scrapy project settings by
applying Apify-related settings, which include adding a custom scheduler, retry middleware, and an item pipeline
for pushing data to the Apify dataset.

Customization:
--------------

Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
components like spiders and handling Actor input. However, make sure you have a clear understanding of your
modifications. For instance, removing `apply_apify_settings` breaks the integration between Scrapy and Apify.

Documentation:
--------------

For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
other topics, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
"""

from __future__ import annotations

import os

from scrapy.crawler import CrawlerProcess

from apify import Actor
from apify.scrapy.utils import apply_apify_settings

# Import your Scrapy spider here
# from .spiders.title import TitleSpider as Spider
from .spiders.worker_spider_v2 import WorkerSpider as Spider
# from .spiders.quotes_spider import QuotesSpider as Spider

# Default input values for local execution using `apify run`
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]


async def main() -> None:
    """
    Apify Actor main coroutine for executing the Scrapy spider.
    """
    async with Actor:
        Actor.log.info('....................................')
        Actor.log.info('Actor is being executed...')
        Actor.log.info(f'env: {os.environ.get("test")}')
        Actor.log.info('....................................')

        # Process the Actor input
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
        proxy_config = actor_input.get('proxyConfiguration')

        # Add the start URLs to the request queue
        rq = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            await rq.add_request(request={'url': url, 'method': 'GET'})

        # Apply the Apify settings; they will override the Scrapy project settings
        settings = apply_apify_settings(proxy_config=proxy_config)

        # Execute the spider using Scrapy CrawlerProcess
        process = CrawlerProcess(settings, install_root_handler=False)
        process.crawl(Spider)
        process.start()
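Because the object returned by apply_apify_settings() is handed straight to CrawlerProcess, it can be treated as a regular Scrapy Settings instance, so project-specific overrides can be layered on top before the crawl starts. A hedged sketch reusing proxy_config and Spider from the file above (the particular values are illustrative, not recommendations baked into this project):

from scrapy.crawler import CrawlerProcess
from apify.scrapy.utils import apply_apify_settings

settings = apply_apify_settings(proxy_config=proxy_config)
settings.set('CONCURRENT_REQUESTS', 8)      # fewer parallel requests per process
settings.set('DOWNLOAD_DELAY', 1)           # one-second pause between requests to a site
settings.set('AUTOTHROTTLE_ENABLED', True)  # adapt the delay to server responses

process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(Spider)
process.start()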
src/middlewares.py
"""
Scrapy middlewares module

This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
responses, typically used for adding custom headers, retrying requests, and handling exceptions.

There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information
on creating and utilizing them, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
"""

from __future__ import annotations
from typing import Generator, Iterable

from scrapy import Request, Spider, signals
from scrapy.crawler import Crawler
from scrapy.http import Response

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TitleSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response: Response, spider: Spider) -> None:
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(
        self,
        response: Response,
        result: Iterable,
        spider: Spider,
    ) -> Generator[Iterable[Request] | None, None, None]:
        # Called with the results returned from the spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(
        self,
        response: Response,
        exception: BaseException,
        spider: Spider,
    ) -> Iterable[Request] | None:
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(
        self, start_requests: Iterable[Request], spider: Spider
    ) -> Iterable[Request]:
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider: Spider) -> None:
        pass


class TitleDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request: Request, spider: Spider) -> Request | Response | None:
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None:
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider: Spider) -> None:
        pass
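The module docstring above lists adding custom headers as a typical middleware use. As a hedged, illustrative example (the class name and header value are hypothetical, not part of this project), a downloader middleware that sets an Accept-Language header on every outgoing request; it would be activated via DOWNLOADER_MIDDLEWARES in src/settings.py:

from scrapy import Request, Spider


class AcceptLanguageMiddleware:
    """Hypothetical downloader middleware, for illustration only."""

    def process_request(self, request: Request, spider: Spider) -> None:
        # Prefer Nepali content, falling back to English; returning None lets
        # Scrapy continue processing the (now modified) request.
        request.headers.setdefault('Accept-Language', 'ne,en;q=0.8')
        return None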
src/pipelines.py
"""
Scrapy item pipelines module

This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
that handle the scraped items, typically used for cleaning, validating, and persisting data.

For detailed information on creating and utilizing item pipelines, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

from scrapy import Spider

from .items import TitleItem


class TitleItemPipeline:
    """
    This item pipeline defines processing steps for TitleItem objects scraped by spiders.
    """

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        # Do something with the item here, such as cleaning it or persisting it to a database
        return item
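The process_item() above is a pass-through. As a hedged illustration (the class name and rules are hypothetical, not part of this project), a pipeline that trims whitespace and drops items with an empty title using Scrapy's standard DropItem exception; it would be enabled via ITEM_PIPELINES in src/settings.py:

from scrapy import Spider
from scrapy.exceptions import DropItem

from .items import TitleItem


class CleanTitleItemPipeline:
    """Hypothetical pipeline, for illustration only."""

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        title = (item.get('title') or '').strip()
        if not title:
            raise DropItem(f"Missing title for {item.get('url')}")
        item['title'] = title
        return item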
src/settings.py
# Scrapy settings for the scrapy_engine project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrapy_engine"

# ----------------------------------------------------
# code from default_spider
# BOT_NAME = 'titlebot'
# DEPTH_LIMIT = 1
LOG_LEVEL = 'INFO'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = False

NEWSPIDER_MODULE = 'src.spiders'
SPIDER_MODULES = ['src.spiders']
# ITEM_PIPELINES = {
#     'src.pipelines.TitleItemPipeline': 123,
# }
# SPIDER_MIDDLEWARES = {
#     'src.middlewares.TitleSpiderMiddleware': 543,
# }
# DOWNLOADER_MIDDLEWARES = {
#     'src.middlewares.TitleDownloaderMiddleware': 543,
# }
# ----------------------------------------------------


# # Crawl responsibly by identifying yourself (and your website) on the user-agent
# # USER_AGENT = "scrapy_engine (+http://www.yourdomain.com)"

# # Obey robots.txt rules
# ROBOTSTXT_OBEY = False

# # Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 15  # 10 for replit # 32

# # Configure a delay for requests to the same website (default: 0)
# # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# # See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1  # 3
# # The download delay setting will honor only one of:
# # CONCURRENT_REQUESTS_PER_DOMAIN = 16
# # CONCURRENT_REQUESTS_PER_IP = 16

# # Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# # Disable Telnet Console (enabled by default)
# # TELNETCONSOLE_ENABLED = False

# # Override the default request headers:
# # DEFAULT_REQUEST_HEADERS = {
# #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# #     "Accept-Language": "en",
# # }

# # Enable or disable spider middlewares
# # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# # SPIDER_MIDDLEWARES = {
# #     "scrapy_engine.middlewares.ScrapyEngineSpiderMiddleware": 543,
# # }

# # Enable or disable downloader middlewares
# # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# # DOWNLOADER_MIDDLEWARES = {
# #     "scrapy_engine.middlewares.ScrapyEngineDownloaderMiddleware": 543,
# # }

# # Enable or disable extensions
# # See https://docs.scrapy.org/en/latest/topics/extensions.html
# # EXTENSIONS = {
# #     "scrapy.extensions.telnet.TelnetConsole": None,
# # }

# # Configure item pipelines
# # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# # ITEM_PIPELINES = {
# #     "scrapy_engine.pipelines.ScrapyEnginePipeline": 300,
# # }

# # Enable and configure the AutoThrottle extension (disabled by default)
# # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True  # dynamically adjusts the download delay based on server responses
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 2  # 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 30  # 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# # AUTOTHROTTLE_DEBUG = False

# # Enable and configure HTTP caching (disabled by default)
# # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = False  # True
# # HTTPCACHE_EXPIRATION_SECS = 0
# # HTTPCACHE_DIR = "httpcache"
# # HTTPCACHE_IGNORE_HTTP_CODES = []
# # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# # Set settings whose default value is deprecated to a future-proof value
# REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
# TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# FEED_EXPORT_ENCODING = "utf-8"


# LOG_LEVEL = 'INFO'       # overview of program execution
# # LOG_LEVEL = 'DEBUG'     # detailed information about program execution
# # LOG_LEVEL = 'WARNING'   # only show warnings and errors
# # LOG_LEVEL = 'ERROR'     # only show errors
# # LOG_LEVEL = 'CRITICAL'  # only show critical errors
# # LOG_LEVEL = 'NOTSET'    # show everything


# # ## ---------------------------------------------------
# # ## ---------------- REDIS-SPECIFIC SETTINGS -----------
# # ## ---------------------------------------------------
# # # Enables storing the scheduler's request queue in Redis.
# # SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# # # Ensure all spiders share the same duplicates filter through Redis.
# # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# # # Redis connection URL
# # REDIS_URL = 'redis://default:saQznB445IS0AEypcHVpzLKtGTxXilui@redis-13950.c292.ap-southeast-1-1.ec2.cloud.redislabs.com:13950'
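A note on the commented-out REDIS_URL above: committing a full connection string, password included, is risky even while the setting is disabled. A hedged sketch (the environment-variable names are assumptions; python-dotenv from requirements.txt could be used to load them) of assembling the URL at import time instead:

import os

# Assumed variable names; adjust to whatever the deployment actually provides.
REDIS_URL = (
    f"redis://default:{os.environ['REDIS_PASSWORD']}"
    f"@{os.environ['REDIS_HOST']}:{os.environ['REDIS_PORT']}"
)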
src/settings_default.py
"""
Scrapy settings module

This module contains Scrapy settings for the project, defining various configurations and options.

For more comprehensive details on Scrapy settings, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# You can update these options and add new ones
BOT_NAME = 'titlebot'
DEPTH_LIMIT = 1
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = True
SPIDER_MODULES = ['src.spiders']
ITEM_PIPELINES = {
    'src.pipelines.TitleItemPipeline': 123,
}
SPIDER_MIDDLEWARES = {
    'src.middlewares.TitleSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'src.middlewares.TitleDownloaderMiddleware': 543,
}
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.7.0
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.1

langid==1.1.6
pymongo==4.6.1
python-dotenv==1.0.0
# redis==5.0.1
requests==2.31.0
Twisted==22.10.0
scrapy.cfg
[settings]
default = src.settings

[deploy]
project = src