nepali-dataset-crawler
Under maintenance
Crawls Nepali-language datasets from online news sites.
Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 6
Monthly users: 4
Runs succeeded: >99%
Last modified: 10 months ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Getting started with Python and Scrapy", "description": "Scrapes titles of websites using Scrapy.", "version": "0.0", "meta": { "templateId": "python-scrapy" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Python Scrapy Scraper", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "proxyConfiguration": { "sectionCaption": "Proxy and HTTP configuration", "title": "Proxy configuration", "type": "object", "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.", "editor": "proxy", "prefill": { "useApifyProxy": true }, "default": { "useApifyProxy": true } } }, "required": ["startUrls"]}
src/spiders/__init__.py
1"""2Scrapy spiders package3
4This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape5and process data from websites.6
7For detailed information on creating and utilizing spiders, refer to the official documentation:8https://docs.scrapy.org/en/latest/topics/spiders.html9"""
src/spiders/functions.py
import gzip  # needed by compress_file()
import langid
import re
import os, sys, json, csv
# import pandas as pd

# import pybloom_live
from pathlib import Path
from urllib.parse import urlparse


def is_valid_text_naive(text):
    stripped_text = text.strip()
    return stripped_text and len(stripped_text) > 3


def remove_fragments_from_url(url):
    # drop query strings and fragments, keep scheme://netloc/path
    parsed_url = urlparse(url)
    return parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path


def load_env_var_in_google_colab():
    # Assuming environment variables are set in google colab Secrets
    if 'google.colab' in sys.modules:
        # Code is running in google colab
        try:
            from google.colab import userdata
            environ_variables = ["REDIS_HOST", "REDIS_PASSWORD", "REDIS_PORT", "mongo_password", "mongo_username"]
            for env_var in environ_variables:
                try:
                    os.environ[env_var] = userdata.get(env_var)
                except Exception as Ex:
                    print(Ex)
                    # break
            os.environ.get("mongo_password")
        except Exception as Ex:
            print(Ex)
def merge_crawled_json_files():
    '''
    * Crawled data is saved in .json files.
    * There are multiple .json files.
    * This function combines them.
    '''
    data_files = set([
        file.split('_')[0].split('.json')[0] for file in os.listdir()
        if (file.endswith('.json')) and (
            file not in
            ['test.json', 'news_start_urls copy.json', 'news_start_urls.json'])
    ])
    merged_data = []
    for file in data_files:
        if remove_file_if_empty(file):
            # Removed empty file
            continue
        try:
            with open(file, 'r') as f:
                data = json.load(f)
        except Exception:
            '''
            The file is corrupt when scrapy is terminated while it is still crawling.
            When corrupt, the file ends with: `{some_data},`
            Appending `""]` at the end of the file makes it valid JSON again.
            '''
            with open(file, 'a') as f:
                f.write("\"\"]")

            with open(file, 'r') as f:
                data = json.load(f)
        merged_data.extend(data)
    return merged_data
def merge_same_named_json_files(delete_merged=False):
    '''
    * Crawled data is saved in .json files.
    * There are multiple .json files.
    * This function combines them.
    '''
    data_files = [
        file for file in os.listdir() if (file.endswith('.json')) and (
            file not in
            ['test.json', 'news_start_urls copy.json', 'news_start_urls.json']) and not file.endswith('_merged_.json')
    ]
    merged_data = []
    merged_file_name = '_merged_.json'
    for file in data_files:
        # merge only if it is less than 10 MB
        if os.path.getsize(file) > 10000000:
            continue
        # merged_file_name = file.split('_')[0].split(
        #     '.json')[0] + '_merged_.json'
        # remove_file_if_empty(merged_file_name)
        print(f'\n\n merged_file_name: {merged_file_name}\n')
        if remove_file_if_empty(file):
            # Removed empty file
            continue
        try:
            with open(file, 'r') as f:
                data = json.load(f)
        except Exception:
            '''
            The file is corrupt when scrapy is terminated while it is still crawling.
            When corrupt, the file ends with: `{some_data},`
            Appending `,{}]` at the end of the file makes it valid JSON again.
            '''
            with open(file, 'a') as f:
                f.write(",{}]")
            print(f'\n\n file: {file}\n')
            with open(file, 'r') as f:
                data = json.load(f)
        # load if merged file exists
        if os.path.exists(merged_file_name):
            try:
                with open(merged_file_name, 'r') as f:
                    old_data = json.load(f)
                if not old_data:
                    old_data = []
                old_data.extend(data)
            except Exception:
                print(f'\n\n-------------- Not merged_file_name: {merged_file_name}\n')
                continue
        else:
            old_data = data
        if not old_data:
            old_data = []
        if old_data:
            with open(merged_file_name, 'w') as f:
                json.dump(old_data, f)
        if delete_merged:
            os.remove(file)


def compress_file(input_file_path="nepali_news_dataset.csv",
                  output_file_path="nepali_news_dataset.csv.gz"):
    '''
    Compress input_file_path into a gzip archive at output_file_path.
    '''
    with open(input_file_path, 'rb') as csv_file:
        with gzip.open(output_file_path, 'wb') as compressed_file:
            compressed_file.writelines(csv_file)
# def save_nepali_paragraphs_to_csv(csv_file_name="crawled_nepali_news_dataset.csv"):
#     '''
#     '''
#     data_files = [
#         file for file in os.listdir() if (file.endswith('.json')) and (
#             file not in
#             ['test.json', 'news_start_urls copy.json', 'news_start_urls.json'])
#     ]
#     for file in data_files:
#         if remove_file_if_empty(file):
#             continue
#         # Removed empty file
#         try:
#             # load data from each file
#             with open(file, 'r') as f:
#                 data = json.load(f)
#         except:
#             '''
#             file is corrupt when scrapy is terminated while it is still crawling.
#             while corrupt, file is terminated with: `{some_data},`
#             adding : `""]` at the end of file to make it valid json file
#             '''
#             with open(file, 'a') as f:
#                 f.write(",{}]")
#             with open(file, 'r') as f:
#                 data = json.load(f)
#         nepali_paragraphs = []
#         for d in data:
#             if type(d) == dict:
#                 if 'paragraph' in d.keys():
#                     nepali_paragraphs.append(d)
#         # print(list(d.values()))
#         # # save dataset in a csv file
#         # with open(csv_file_name, 'a', newline='') as csv_file:
#         #     # Create a CSV writer object
#         #     csv_writer = csv.writer(csv_file)
#         #     # Write the data to the CSV file
#         #     csv_writer.writerows(list(d.values()))
#         print(f'len_nepali_paragraphs:{len(nepali_paragraphs)} keys:{nepali_paragraphs[0].keys()}')
#         # Open the CSV file in append mode
#         with open(csv_file_name, 'a', newline='') as csv_file:
#             # Create a CSV writer object
#             csv_writer = csv.writer(csv_file)
#
#             # Check if the CSV file is empty
#             is_empty = os.path.getsize(csv_file_name) == 0
#
#             # Extract unique column names from dictionary keys
#             column_names = set(key for d in nepali_paragraphs for key in d.keys())
#
#             # Write the header only if the CSV file is empty
#             if is_empty:
#                 csv_writer.writerow(column_names)
#
#             # Iterate through each dictionary and write rows to CSV
#             for d in nepali_paragraphs:
#                 # Create a list of values for the row, using empty string for missing keys
#                 row_values = [str(d.get(column, '')) for column in column_names]
#                 csv_writer.writerow(row_values)
#     # ----------------------------
#     # Drop duplicate rows in csv
#     # ----------------------------
#
#     # Read the CSV file into a DataFrame
#     df = pd.read_csv(csv_file_name)
#     # Drop duplicate rows
#     df.drop_duplicates(inplace=True)
#     # Write the cleaned DataFrame back to the CSV file
#     df.to_csv(csv_file_name, index=False)
def remove_file_if_empty(file_path):
    """Checks if a file is empty and removes it if it is.

    Args:
        file_path (str): The path to the file to check.

    Returns:
        bool: True if the file was empty and removed, False otherwise.
    """
    if os.path.exists(file_path):
        if os.path.getsize(file_path) == 0:
            # File size is 0 bytes
            try:
                os.remove(file_path)
                print(f"Removed empty file: {file_path}")
                return True
            except OSError as e:
                print(f"Error removing file: {e}")
                return False
        else:
            try:
                if os.path.getsize(file_path) > 10000000:
                    # Open only files less than 10 MB
                    return False
                with open(file_path, 'r') as f:
                    data = json.load(f)
            except Exception as Ex:
                print(f'----------------------------------- Exception: {Ex} -----------------------------------\n file_path: {file_path}\n ')
                with open(file_path, 'a') as f:
                    f.write(",{}]")
                with open(file_path, 'r') as f:
                    data = json.load(f)
            if not data:
                # File contains no data
                try:
                    os.remove(file_path)
                    print(f"Removed empty file: {file_path}")
                    return True
                except OSError as e:
                    print(f"Error removing file: {e}")
                    return False
            else:
                print(f"File is not empty or does not exist: {file_path}")
                return False
256
257# def get_resume_urls(domain_name_to_resume_from):258# '''259# -------------260# pseudocode:261# -------------262
263# domain_name_to_resume_from = ekantipur264
265# while there is file ['ekantipur.json', 'ekantipur_0.json', 'ekantipur_1.json', ...]:266# urls_to_visit.add(to_visited_urls)267# urls_visited.add(visited_urls)268
269# fro url in urls_to_visit:270# add url to news_urls if url is not present in urls_visited271
272# return list(news_urls)273# '''274
275# new_file_name = domain_name_to_resume_from + '.json'276# remove_file_if_empty(new_file_name)277
278# urls_to_visit = set()279# urls_visited = pybloom_live.ScalableBloomFilter(280# mode=pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH)281# news_start_ulrs = set()282# index = 0283 284# # load data from merged file285# if os.path.exists(domain_name_to_resume_from + '_merged_.json'): 286# if not remove_file_if_empty(new_file_name):287# # File not empty288# with open(domain_name_to_resume_from + '_merged_.json','r') as f:289# data = json.load(f)290# if data:291# for each_data in data:292# if type(each_data) == dict and 'to_visit' in each_data.keys():293# urls_to_visit.add(294# each_data['to_visit']) # each_data['to_visit'] is a of urls295# if type(each_data) == dict:296# if 'visited' in each_data.keys():297# urls_visited.add(298# each_data['visited']) # each_data['visited'] is a url299
300# while os.path.exists(new_file_name):301# if remove_file_if_empty(new_file_name):302# break303# try:304# with open(new_file_name, 'r') as file:305# data = json.load(file)306# except:307# '''308# file is corrupt when scrapy is terminated while it is still crawling.309# while corrupt, file is terminated with: `{some_data},`310# adding : `""]` at the end of file to make it valid json file311
312# '''313# with open(new_file_name, 'a') as file:314# file.write("\"\"]")315
316# with open(new_file_name, 'r') as file:317# data = json.load(file)318# for each_data in data:319# if type(each_data) == dict and 'to_visit' in each_data.keys():320# urls_to_visit.add(321# each_data['to_visit']) # each_data['to_visit'] is a of urls322
323# if type(each_data) == dict:324# if 'visited' in each_data.keys():325# urls_visited.add(326# each_data['visited']) # each_data['visited'] is a url327
328# new_file_name = domain_name_to_resume_from + f'_{index}.json'329# # remove file if file is empty330# remove_file_if_empty(new_file_name)331
332# index += 1333
334# print(f'\n\n new_file_name from function:{new_file_name}')335# for to_visit in urls_to_visit:336# if to_visit not in urls_visited:337# news_start_ulrs.add(to_visit)338
339# return list(news_start_ulrs), urls_visited340
341
def is_nepali_language(text):
    # Note: langid frequently labels Devanagari (Nepali) text as 'hi',
    # which is why 'hi' is treated as a match here.
    lang, confidence = langid.classify(text)
    return lang == 'hi', confidence
    # is_nepali("बल्ल बुझायो एनसेलले खरिद-बिक्री विवरण")


def is_google_drive_link(link):
    # Return (True, drive_id) for Google Drive links, otherwise (False, original_link).
    drive_prefixes = [
        'https://drive.google.com/file/d/',
        'https://drive.google.com/drive/folders/',
        'https://drive.google.com/open?id=',
        'https://drive.google.com/drive/u/0/folders/',
        'https://drive.google.com/drive/u/1/folders/',
        'https://drive.google.com/drive/u/2/folders/',
        'https://drive.google.com/drive/u/3/folders/',
        'https://drive.google.com/drive/u/0/file/d/',
        'https://drive.google.com/drive/u/1/file/d/',
        'https://drive.google.com/drive/u/2/file/d/',
        'https://drive.google.com/drive/u/3/file/d/',
        'https://drive.google.com/drive/mobile/folders/',
        'https://drive.google.com/folderview?id=',
    ]
    for prefix in drive_prefixes:
        if link.startswith(prefix):
            drive_id = link.split(prefix)[1].split('/')[0].split('?')[0]
            return True, drive_id

    # not a drive link: return the original link
    return False, link

def is_same_domain(url1, url2):
    return urlparse(url1).netloc == urlparse(url2).netloc


def is_np_domain(url):
    # return True if the url is under the .np top-level domain
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # crawl it if .np domain and len(visited_urls_base) < 1000

    return base_url.endswith('.np') or base_url.endswith('.np/')


def is_special_domain_to_crawl(url):
    # crawl some non '.np' domains
    return url.startswith("https://www.bbc.com/nepali/") or url.startswith("https://np.usembassy.gov/ne/")


def is_document_or_media(url):
    is_social_media, _ = is_social_media_link(url)
    is_document, _, _ = is_document_link(url)
    if is_social_media or is_document:
        return True
    return False


def should_we_crawl_it(new_url, visited_urls_base=None):
    # return (base_url, True) if it is a .np domain and fewer than 50 base domains have been visited
    parsed_url = urlparse(new_url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # crawl it if .np domain and len(visited_urls_base) < 50

    if base_url[-3:] == '.np' and (visited_urls_base is None
                                   or len(visited_urls_base) < 50):
        return base_url, True
    else:
        return base_url, False


# import re
def is_social_media_link(link):
    social_media_patterns = {
        'facebook': r"facebook\.com",
        'instagram': r"instagram\.com",
        'twitter': r"twitter\.com",
        'x': r"x\.com",
        'tiktok': r"tiktok\.com",
        'linkedin': r"linkedin\.com",
        'pinterest': r"pinterest\.com",
        'youtube': r"youtube\.com|youtu\.be",
        'reddit': r"reddit\.com",
        'snapchat': r"snapchat\.com",
        'quora': r"quora\.com",
        'tumblr': r"tumblr\.com",
        'flickr': r"flickr\.com",
        'medium': r"medium\.com",
        'vimeo': r"vimeo\.com",
        'vine': r"vine\.co",
        'periscope': r"periscope\.tv",
        'meetup': r"meetup\.com",
        'mix': r"mix\.com",
        'soundcloud': r"soundcloud\.com",
        'behance': r"behance\.net",
        'dribbble': r"dribbble\.com",
        'vk': r"vk\.com",
        'weibo': r"weibo\.com",
        'ok': r"ok\.ru",
        'deviantart': r"deviantart\.com",
        'slack': r"slack\.com",
        'telegram': r"t\.me|telegram\.me",
        'whatsapp': r"whatsapp\.com",
        'line': r"line\.me",
        'wechat': r"wechat\.com",
        'kik': r"kik\.me",
        'discord': r"discord\.gg",
        'skype': r"skype\.com",
        'twitch': r"twitch\.tv",
        'myspace': r"myspace\.com",
        'badoo': r"badoo\.com",
        'tagged': r"tagged\.com",
        'meetme': r"meetme\.com",
        'xing': r"xing\.com",
        'renren': r"renren\.com",
        'skyrock': r"skyrock\.com",
        'livejournal': r"livejournal\.com",
        'fotolog': r"fotolog\.com",
        'foursquare': r"foursquare\.com",
        'cyworld': r"cyworld\.com",
        'gaiaonline': r"gaiaonline\.com",
        'blackplanet': r"blackplanet\.com",
        'care2': r"care2\.com",
        'cafemom': r"cafemom\.com",
        'nextdoor': r"nextdoor\.com",
        'kiwibox': r"kiwibox\.com",
        'cellufun': r"cellufun\.com",
        'tinder': r"tinder\.com",
        'bumble': r"bumble\.com",
        'hinge': r"hinge\.co",
        'match': r"match\.com",
        'okcupid': r"okcupid\.com",
        'zoosk': r"zoosk\.com",
        'plentyoffish': r"pof\.com",
        'eharmony': r"eharmony\.com",
        'coffee_meets_bagel': r"coffeemeetsbagel\.com",
        'her': r"weareher\.com",
        'grindr': r"grindr\.com",
        'happn': r"happn\.com",
        'hily': r"hily\.com",
        'huggle': r"huggle\.com",
        'jdate': r"jdate\.com",
        'lovoo': r"lovoo\.com",
        'meetmindful': r"meetmindful\.com",
        'once': r"once\.com",
        'raya': r"raya\.app",
        'ship': r"getshipped\.com",
        'silversingles': r"silversingles\.com",
        'tastebuds': r"tastebuds\.fm",
        'the_league': r"theleague\.com",
        'tudder': r"tudder\.com",
        'twoo': r"twoo\.com",
    }

    for social_media, pattern in social_media_patterns.items():
        if (re.search(r'https://www\.' + pattern, link, flags=re.IGNORECASE)) or (
                (re.search(r'https://www\.m\.' + pattern, link, flags=re.IGNORECASE))):
            return True, social_media

    return False, None

# is_social_media_link('https://www.facebook.com/sth.s/ste')  # (True, 'facebook')
# is_social_media_link('https://www.m.youtube.com/')  # (True, 'youtube')
# is_social_media_link('https://www.google.com/search?q=youtube.com')  # (False, None)


def is_document_link(link):
    # document extensions
    document_extensions = [
        '.pdf', '.odt', '.docx', '.doc', '.pst', '.ai', '.drw', '.dxf', '.eps',
        '.ps', '.svg', '.cdr', '.odg'
    ]
    # presentation extensions
    presentation_extensions = ['.ppt', '.pptx', '.odp', '.pps']
    # spreadsheet extensions
    spreadsheet_extensions = ['.xls', '.xlsx', '.ods']
    # archive extensions
    archive_extensions = [
        '.zip', '.rar', '.tar', '.gz', '.7z', '.7zip', '.bz2', '.tar.gz', '.xz'
    ]
    # video extensions
    video_extensions = [
        '.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.mng', '.mpg', '.qt',
        '.rm', '.swf', '.m4v', '.webm'
    ]
    # audio extensions
    audio_extensions = [
        '.mp3', '.wav', '.ogg', '.wma', '.ra', '.aac', '.mid', '.au', '.aiff',
        '.3gp', '.asf', '.asx', '.m4a'
    ]
    # image extensions
    image_extensions = [
        '.bmp', '.gif', '.jpg', '.jpeg', '.png', '.tif', '.tiff', '.ico',
        '.webp', '.mng', '.pct', '.psp'
    ]
    # other extensions
    other_extensions = ['.css', '.exe', '.bin', '.rss', '.dmg', '.iso', '.apk']

    extensions = {
        'document': document_extensions,
        'presentation': presentation_extensions,
        'spreadsheet': spreadsheet_extensions,
        'archive': archive_extensions,
        'video': video_extensions,
        'audio': audio_extensions,
        'image': image_extensions,
        'other': other_extensions
    }

    for doc_type, extension_list in extensions.items():
        for extension in extension_list:
            if link.lower().endswith(extension):
                return True, doc_type, (
                    doc_type == 'other'
                )  # doc_type == 'image' to ignore image or other docs
    return False, None, True


# is_document_link('https://acem.edu.np/uploads/userfiles/files/old_question/8th%20Semester/BCTBEXBEL/Wind%20Energy%202.PDF')
# is_document, document_type = is_document_link('https://www.google.com/link.pdf')
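A few hedged examples of how the helpers above behave, assuming the module is importable as `src.spiders.functions`; the URLs are made up:

from src.spiders.functions import (
    remove_fragments_from_url, is_np_domain, is_social_media_link, is_document_link)

# Query strings and fragments are stripped, leaving scheme + host + path.
print(remove_fragments_from_url("https://example.gov.np/news/123?ref=home#title"))
# -> https://example.gov.np/news/123

# Only the netloc is inspected for the .np check.
print(is_np_domain("https://example.gov.np/news/123"))      # True
print(is_np_domain("https://www.bbc.com/nepali/articles"))  # False

# Social media and document links are classified so the spider can skip or log them.
print(is_social_media_link("https://www.facebook.com/somepage"))    # (True, 'facebook')
print(is_document_link("https://example.gov.np/files/report.pdf"))  # (True, 'document', False)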
src/spiders/mongo.py
import csv
import os
import time
from pymongo.server_api import ServerApi

from dotenv import load_dotenv
load_dotenv()

# Creating mongodb client
from pymongo import MongoClient


class Mongo():
    def __init__(self, db_name='scrapy-engine', collection_name="urls-collection", local=False):
        if local:
            client = MongoClient('localhost', 27017)
            self.db = client[db_name]
            self.collection = client[db_name][collection_name]

            # Ensure index is created for faster search operations
            self.collection.create_index('url', unique=True)
        else:
            uri = f"mongodb+srv://jokeleopedia:{os.environ.get('mongo_password')}@scrapy-engine.5cqch4y.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"
            # uri = f"mongodb+srv://{os.environ.get('mongo_username')}:{os.environ.get('mongo_password')}@scrapy-engine.xwugtdk.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"
            # uri = f"mongodb+srv://{os.environ.get('user_heroku')}:{os.environ.get('pass_heroku')}@cluster0.dgeujbs.mongodb.net/?retryWrites=true&w=majority"

            # Create a connection using MongoClient. You can import MongoClient or use pymongo.MongoClient
            client = MongoClient(uri, server_api=ServerApi('1'))

            # Create the database for our example (we will use the same database throughout the tutorial)
            self.db = client[db_name]
            self.collection = self.db[collection_name]  # using single collection for all urls
            # # one time operation
            # self.collection.create_index('url', unique=True)

    def append_error_data(self, data):
        '''
        # e.g. first add new data and update it

        import time
        from mongo import Mongo
        mongo = Mongo()

        error_data = {'url':'https://google.com', 'timestamp':time.time(), 'status':'error', 'status_code':400, 'error_type':'HttpError'}
        error_data2 = {'url':'https://google.com', 'timestamp':'now', 'status':'not-error', 'status_code':'errororrororo', 'error_type':'HttpError'}

        # insert error_data
        mongo.db['test'].update_one(
            {'url': error_data['url']},
            {'$set': error_data},  # data is dict with error info
            upsert=True
        )
        list(mongo.db['test'].find({'url':'https://google.com'}))  # output: [{'_id': ObjectId('66a9c58c0bee3830ff4dcba3'), 'url': 'https://google.com', 'error_type': 'HttpError', 'status': 'error', 'status_code': 400, 'timestamp': 1722402186.1221087}]

        # update with error_data2
        mongo.db['test'].update_one(
            {'url': error_data2['url']},
            {'$set': error_data2},  # data is dict with error info
            upsert=True
        )
        list(mongo.db['test'].find({'url':'https://google.com'}))  # output: [{'_id': ObjectId('66a9c58c0bee3830ff4dcba3'), 'url': 'https://google.com', 'error_type': 'HttpError', 'status': 'not-error', 'status_code': 'errororrororo', 'timestamp': 'now'}]
        '''

        # upsert the error data
        self.collection.update_one(
            {'url': data['url']},
            {'$set': data},  # data is dict with error info
            upsert=True
        )

        # Delete url if its status is either 'to_crawl' or 'crawling'
        # if status is crawled, try/except should avoid creating error
        # results = list(self.collection.find({'url':data['url'], 'status': {'$in': ['to_crawl', 'crawling']}}))
        # if results:
        #     self.collection.delete_one({'_id':results[0]['_id']})

        # try:
        #     # Try inserting url
        #     self.collection.insert_one(data)
        #     return True  # inserted
        # except Exception as ex:
        #     print(ex)
        #     return  # error data exists

    def append_url_crawled(self, url):
        try:
            # Try inserting url
            self.collection.insert_one({'url': url, 'timestamp': time.time(), 'status': 'crawled'})
        except Exception as ex:
            # url exists: change status to 'crawled'
            self.collection.update_one({'url': url}, {'$set': {'status': 'crawled'}})

    def append_url_crawling(self, url):
        try:
            # Try inserting url
            self.collection.insert_one({'url': url, 'timestamp': time.time(), 'status': 'crawling'})
            return True  # inserted
        except Exception as ex:
            # modify to_crawl to crawling
            result = self.collection.update_one(
                {'url': url, 'status': {'$in': ['to_crawl']}},
                {'$set': {'status': 'crawling'}},
            )
            if result.modified_count > 0:
                # print("url was in to_crawl and now changed to crawling")
                return True
            else:
                # print("it is either already crawling or crawled")
                return False

    def append_url_to_crawl(self, url):
        if type(url) == list:
            # Insert multiple urls
            '''
            ordered=False: If an error occurs during the processing of one of the write operations, MongoDB will continue to process remaining write operations in the queue.
            '''
            try:
                self.collection.insert_many([{'url': u, 'timestamp': time.time(), 'status': 'to_crawl?'} for u in url], ordered=False)
            except Exception as ex:
                pass
                # print(ex)
        elif type(url) == str:
            try:
                # Try inserting url
                self.collection.insert_one({'url': url, 'timestamp': time.time(), 'status': 'to_crawl?'})
                return True  # inserted
            except Exception as ex:
                pass
                # print(ex)  # url exists

    def check_connection(self):
        # Create a new client and connect to the server
        uri = f"mongodb+srv://jokeleopedia:{os.environ.get('mongo_password')}@scrapy-engine.5cqch4y.mongodb.net/?retryWrites=true&w=majority&appName=scrapy-engine"
        client = MongoClient(uri, server_api=ServerApi('1'))
        # ping to confirm a successful connection
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

    def delete_to_crawl(self, url):
        try:
            self.collection.delete_one({'url': url, 'status': 'to_crawl'})
        except Exception as ex:
            print(ex)

    def export_local_mongo(self):
        '''
        # Export the data from local mongo to a csv file 'local_mongo.csv'
        with headers: 'url', 'timestamp', 'status'

        # Comments
        * 10K at a time was very slow. 100K at a time is still slow though..
        '''
        def save_to_csv(data, csv_file_path='local_mongo.csv', field_names=['url', 'timestamp', 'status']):
            file_exists = os.path.exists(csv_file_path)
            with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
                # Create a CSV writer object
                csv_writer = csv.DictWriter(csvfile, fieldnames=field_names)

                # If the file doesn't exist, write the header
                if not file_exists:
                    print('writing header')
                    csv_writer.writeheader()

                # Write the data to the CSV file
                try:
                    # Save data to csv file
                    csv_writer.writerows(data)
                except Exception as ex:
                    print(ex)
        # save_to_csv(entries)

        entries_count = self.collection.count_documents({})
        print(f'Exporting {entries_count} entries')
        # get all entries 100000 at a time
        for i in range(0, entries_count, 100000):
            entries = list(self.collection.find({}).skip(i).limit(100000))
            entries = [{'url': entry['url'], 'timestamp': entry['timestamp'], 'status': entry['status']} for entry in entries]
            print(f'Exporting {i} to {i+100000} entries. step: {int(i/100000)}/{int(entries_count/100000)}')
            save_to_csv(entries)

    def import_to_local_mongo(self):
        '''
        # Load the data from local_mongo.csv to local mongo
        * 10K at a time
        '''
        def load_from_csv(csv_file_path='local_mongo.csv', field_names=['url', 'timestamp', 'status']):
            def save_to_mongo(self, entries):
                # Insert multiple data
                '''
                ordered=False: If an error occurs during the processing of one of the write operations, MongoDB will continue to process remaining write operations in the queue.
                '''
                try:
                    # self.collection.insert_many([{'url':data['url'], 'timestamp':data['timestamp'], 'status':data['status']} for data in datas], ordered=False)
                    self.collection.insert_many(entries, ordered=False)
                except Exception as ex:
                    pass
                    # print(ex)
            data = []
            import_count = 0
            with open(csv_file_path, 'r', newline='', encoding='utf-8') as csvfile:
                # Create a CSV reader object
                csv_reader = csv.DictReader(csvfile)
                for row in csv_reader:
                    '''
                    format of data is:
                    [
                        {
                            'url': 'https://psc.gov.np/others/4/d46d4ca7616bcde106e08f4b8636a247e84450197b6062248f2cb8cd2f550277b55612bd7320cd3f2d7aec2ada0707ba992e0a2bfb66a7154cf921a2ae37028964.NDLeVePim0d~oLhNjuuA4I~L7sdJOUPqysLgcC6c-.html',
                            'timestamp': '1716021918.59285',
                            'status': 'crawled'
                        },
                        ...
                    ]
                    '''
                    data.append(row)
                    if len(data) > 10000:
                        # Save to mongo
                        save_to_mongo(self, data)
                        data = []  # re-initialize data
                        import_count += 10000
                        print(f'Import count: {import_count} iteration: {int(import_count/10000)}')
            # return data
        load_from_csv()

    def recover_expired_crawling(self, created_before=7200):
        # def convert_from_crawling_to_to_crawl(urls):
        #     # for url in urls:
        #     #     self.collection.update_one(
        #     #         {'_id':url['_id'], 'status': {'$in': ['crawling']}},
        #     #         {'$set': {'status':'to_crawl'}}
        #     #     )
        #     # perform bulk update
        #     if urls:
        #         self.collection.update_many(
        #             {'_id': {'$in': [url['_id'] for url in urls]}},
        #             {'$set': {'status':'to_crawl'}}
        #         )

        # get items with status 'crawling' whose timestamp is older than 2 hours (7200 s)
        timestamp = time.time() - created_before  # 2 hours ago

        pipeline = [
            {"$match": {"status": "crawling", "timestamp": {"$lt": str(timestamp)}}},
            # {"$count": "count"}
        ]
        # The result is a list of documents returned by the aggregation pipeline
        expired_crawling_urls = list(self.collection.aggregate(pipeline))
        return expired_crawling_urls
        # convert_from_crawling_to_to_crawl(expired_crawling_urls)

    def fetch_start_urls(self, number_of_urls_required=10):
        # return [json.loads(url) for url in self.redis_client.srandmember('urls_to_crawl_cleaned_set', number_of_new_urls_required)]

        # Get random entries with status 'to_crawl'
        random_urls = list(self.collection.aggregate([
            {"$match": {"status": "to_crawl"}},
            {"$sample": {"size": number_of_urls_required}}
        ]))
        # urls = list(self.collection.find({'status':'to_crawl'}).limit(number_of_urls_required))

        # perform bulk update
        if random_urls:
            self.collection.update_many(
                {'_id': {'$in': [url['_id'] for url in random_urls]}},
                {'$set': {'status': 'crawling'}}
            )

        return random_urls

    def fetch_all(self):
        return list(self.collection.find())

    def set_configs(self, configs):
        '''
        # Tests:
        * variable types seem to be preserved
        * if key exists, value is updated; otherwise a new key-value pair is created
        # Format of configs
        ```
        configs = [
            {'crawl_other_data': False},
            {'crawl_paragraph_data': True},
            {'some_config': 1000}
        ]
        ```
        '''
        for config in configs:
            self.db['config'].update_one(
                {'name': list(config.keys())[0]},
                {'$set': {'value': list(config.values())[0]}},
                upsert=True
            )

    def get_configs(self):
        '''
        # Returns data of format:
        ```
        {
            'crawl_other_data': False,
            'crawl_paragraph_data': True
        }
        ```
        '''
        configs_data = self.db['config'].find({})
        configs = {}
        for config in configs_data:
            configs[config['name']] = config['value']
        return configs


if __name__ == "__main__":
    # delete all data and create unique index for field: 'url'
    mongo = Mongo()

    collection_names = ['url_crawled', 'url_to_crawl', 'url_crawling']
    for collection_name in collection_names:
        mongo.db[collection_name].delete_many({})
        mongo.db[collection_name].create_index('url', unique=True)

    # # local mongo
    # # from mongo import Mongo
    # local_mongo = Mongo(local=True)

    # # Get all entries with limit 10
    # print(local_mongo.collection.find({}).limit(10))

    # entries_count = local_mongo.collection.count_documents({})
    # # get all entries 10000 at a time
    # for i in range(0, entries_count, 10000):
    #     entries = list(local_mongo.collection.find({}).skip(i).limit(10000))
    #     append_to_csv(entries)
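A rough usage sketch for the Mongo helper above, assuming a MongoDB instance is reachable on localhost:27017 (the hosted-cluster branch instead needs the mongo_password environment variable); the URLs are made up:

from src.spiders.mongo import Mongo

mongo = Mongo(local=True)  # connects to mongodb://localhost:27017

# Store crawler configuration flags that the spider reads at startup.
mongo.set_configs([{'crawl_other_data': False}, {'crawl_paragraph_data': True}])
print(mongo.get_configs())  # {'crawl_other_data': False, 'crawl_paragraph_data': True}

# Track a URL through its lifecycle.
mongo.append_url_crawling('https://example.gov.np/news/123')
mongo.append_url_crawled('https://example.gov.np/news/123')

# Queue more URLs. Note that append_url_to_crawl() stores status 'to_crawl?' while
# fetch_start_urls() samples documents with status 'to_crawl', so a separate step
# presumably promotes queued URLs before they are handed out.
mongo.append_url_to_crawl(['https://example.gov.np/a', 'https://example.gov.np/b'])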
src/spiders/quotes_spider.py
import random

import scrapy


# scrapy crawl quotes_spider -o quotes.json
class QuotesSpider(scrapy.Spider):
    name = "quotes_spider"
    start_urls = [
        "https://quotes.toscrape.com/page/1/",
        "https://quotes.toscrape.com/page/2/",
        # "https://quotes.toscrape.com/page/3/",
        # "https://quotes.toscrape.com/page/4/",
        # "https://quotes.toscrape.com/page/5/",
        # "https://quotes.toscrape.com/page/6/",
        # "https://quotes.toscrape.com/page/7/",
        # "https://quotes.toscrape.com/page/8/",
        # "https://quotes.toscrape.com/page/9/",
        # "https://quotes.toscrape.com/page/10/",
    ]

    def parse(self, response):
        import os; print('\n\n\n\n', os.listdir(), os.environ.get('some_key2'), '\n\n')
        # print(f"\n before {self.crawler.engine.slot.scheduler.stats._stats['scheduler/enqueued']} \n\n")
        print(f"\n before2 {len(self.crawler.engine.slot.inprogress)}")  # don't know why it always returns zero
        print(self.crawler.engine.settings.get('CONCURRENT_REQUESTS'))
        print(len(self.crawler.engine.slot.inprogress) >= self.crawler.engine.settings.get('CONCURRENT_REQUESTS'))
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
                "tags": quote.css("div.tags a.tag::text").getall(),
            }
        # next_page = response.css("li.next a::attr(href)").get()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
        # print(f"\n After {self.crawler.engine.slot.scheduler.stats._stats['scheduler/enqueued']} \n\n")
        # print(f"\n after2 {len(self.crawler.engine.slot.scheduler)}")  # don't know why it always returns zero

    def spider_closed(self, spider, reason):
        print(f"Spider is almost finished. reason: {reason}")

        # Restart spider with more start urls
        new_urls = get_new_start_urls()
        for url in new_urls:
            yield scrapy.Request(url, callback=self.parse)


all_start_urls = [
    # "https://quotes.toscrape.com/page/1/",
    # "https://quotes.toscrape.com/page/2/",
    "https://quotes.toscrape.com/page/3/",
    "https://quotes.toscrape.com/page/4/",
    "https://quotes.toscrape.com/page/5/",
    "https://quotes.toscrape.com/page/6/",
    "https://quotes.toscrape.com/page/7/",
    "https://quotes.toscrape.com/page/8/",
    "https://quotes.toscrape.com/page/9/",
    "https://quotes.toscrape.com/page/10/"
]


def get_new_start_urls():
    new_start_urls = random.sample(all_start_urls, 2)
    for url in new_start_urls:
        all_start_urls.remove(url)
    return new_start_urls
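The comment at the top of the file shows the CLI invocation (scrapy crawl quotes_spider -o quotes.json); an equivalent programmatic sketch, assuming the project is importable as `src`, would be:

from scrapy.crawler import CrawlerProcess
from src.spiders.quotes_spider import QuotesSpider

# Write scraped quotes to quotes.json via Scrapy's feed exports.
process = CrawlerProcess(settings={
    "FEEDS": {"quotes.json": {"format": "json", "overwrite": True}},
})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes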
src/spiders/title.py
from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from scrapy import Request, Spider
from scrapy.responsetypes import Response

from ..items import TitleItem


class TitleSpider(Spider):
    """
    Scrapes title pages and enqueues all links found on the page.
    """

    name = 'title_spider'

    # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
    # when the project is executed using Apify.
    start_urls = ['https://apify.com/']

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        """
        Parse the web page response.

        Args:
            response: The web page response.

        Yields:
            Yields scraped TitleItem and Requests for links.
        """
        self.logger.info('TitleSpider is parsing %s...', response)

        # Extract and yield the TitleItem
        url = response.url
        title = response.css('title::text').extract_first()
        yield TitleItem(url=url, title=title)

        # Extract all links from the page, create Requests out of them, and yield them
        for link_href in response.css('a::attr("href")'):
            link_url = urljoin(response.url, link_href.get())
            if link_url.startswith(('http://', 'https://')):
                yield Request(link_url)
src/spiders/worker_spider_v2.py
from .functions import is_social_media_link, is_document_link, is_google_drive_link, is_same_domain, is_np_domain, is_special_domain_to_crawl, load_env_var_in_google_colab, remove_fragments_from_url, is_nepali_language, is_valid_text_naive, is_document_or_media
# import pybloom_live
import scrapy
import dotenv
# import json
# import os
# import redis
# import threading
import time

# from scrapy import signals  # , Spider
from scrapy.linkextractors import LinkExtractor
from .mongo import Mongo

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError


class WorkerSpider(scrapy.Spider):
    name = "worker_spider"
    crawled_data = []
    to_visit = []
    other_data = []

    def __init__(self, *args, **kwargs):
        # load environment variables if running in google colab
        load_env_var_in_google_colab()
        # dotenv.load_dotenv("server/.env")

        # super(EkantipurSpider, self).__init__(*args, **kwargs)
        self.mongo = Mongo()

        # ----------------------------------------------------------------------------
        # config variable from mongo db to decide whether or not to crawl other_data
        print('------------------------------------')

        # Get configs
        configs = self.mongo.get_configs()

        # default: Do not crawl other data
        self.crawl_other_data = configs['crawl_other_data'] if 'crawl_other_data' in configs else False
        print(f'config.crawl_other_data: {self.crawl_other_data}')

        # default: Crawl paragraph data
        self.crawl_paragraph_data = configs['crawl_paragraph_data'] if 'crawl_paragraph_data' in configs else True
        print(f'config.crawl_paragraph_data: {self.crawl_paragraph_data}')
        print('------------------------------------')
        # ----------------------------------------------------------------------------

        # self.redis_client = redis.Redis(
        #     host=os.environ.get('REDIS_HOST', 'localhost'),
        #     port=int(os.environ.get('REDIS_PORT', 6379)),
        #     password=os.environ.get('REDIS_PASSWORD', None),
        # )

        # self.start_urls = self.fetch_start_urls()
        # self.start_urls = [data_item['url'] for data_item in self.mongo.fetch_start_urls()]
        # print(f'\n\n start_urls: {self.start_urls} \n\n')
        # for url in self.start_urls:
        #     yield scrapy.Request(url, callback=self.parse, errback=self.errback_httpbin)  # , dont_filter=True : allows visiting same url again

        # using bloom filter to store visited urls for faster lookup
        # self.visited_urls = pybloom_live.ScalableBloomFilter(mode=pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH)

    # def fetch_start_urls(self, number_of_new_urls_required=10):
    #     return [json.loads(url) for url in self.redis_client.srandmember('urls_to_crawl_cleaned_set', number_of_new_urls_required)]

    def start_requests(self):
        n_concurrent_requests = self.crawler.engine.settings.get('CONCURRENT_REQUESTS')
        start_urls = [remove_fragments_from_url(data_item['url']) for data_item in self.mongo.fetch_start_urls(n_concurrent_requests)]
        print(f'\n\n start:{start_urls} \n\n')
        for url in start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.errback_httpbin)  # , dont_filter=True : allows visiting same url again

    def parse(self, response):
        if 'text/html' not in response.headers.get('Content-Type', b'').decode('utf-8'):
            # response is not text/html (it's probably some document)
            # save the link to mongo
            self.mongo.db['other_data'].insert_one(
                {
                    'url': response.url,
                    'Content-Type': response.headers.get('Content-Type', b'').decode('utf-8'),
                }
            )
            return
        links = LinkExtractor(deny_extensions=[]).extract_links(response)
        drive_links = []  # Drive links
        site_links = []  # website links

        # yield every heading, paragraph from current page
        headings_and_paragraphs = response.css('h1::text, h2::text, h3::text, h4::text, h5::text, h6::text, p::text').getall()
        if self.crawl_paragraph_data:
            # Crawl paragraph data only if config is set to True
            the_crawled_data = []  # list for bulk upload
            for paragraph in headings_and_paragraphs:
                is_nepali, confidence = is_nepali_language(paragraph)
                if is_nepali and is_valid_text_naive(paragraph):  # implement a way to detect nepali language (langid is not reliable)
                    # Collect crawled data for a bulk insert
                    the_crawled_data.append({
                        'parent_url': response.url,
                        'page_title': response.css('title::text').get(),
                        'paragraph': paragraph,
                        # 'is_nepali_confidence': confidence
                    })
            '''
            # Old Code using Redis: Redis turned out to be too slow for concurrent processes
            self.redis_client.lpush('crawled_data', json.dumps(the_crawled_data))
            '''
            if the_crawled_data:
                self.mongo.db['crawled_data'].insert_many(the_crawled_data)
        else:
            pass
            # print(f'Avoided crawling paragraph_data: {not self.crawl_paragraph_data}')

        # Saving drive links
        if self.crawl_other_data:
            # config['crawl_other_data'] is True, so crawl other data
            the_other_data = []  # list for bulk upload
            for link in links:
                is_drive_link, _ = is_google_drive_link(link.url)  # DriveFunctions.is_google_drive_link(link.url)
                if is_drive_link:
                    the_other_data.append({
                        'parent_url': response.url,
                        'url': link.url,
                        'text': link.text,
                        'link_type': "drive_link",
                        'link_description': None
                    })
                    # self.mongo.db['other_data'].insert_one(the_other_data)
                    '''
                    # Old Code using Redis: Redis turned out to be too slow for concurrent processes
                    self.redis_client.lpush('other_data', json.dumps(the_other_data))
                    '''
                else:
                    is_document, document_type, ignore_doc = is_document_link(link.url)
                    if is_document:
                        if not ignore_doc:  # and doc_type != 'image'
                            the_other_data.append({
                                'parent_url': response.url,
                                'url': link.url,
                                'text': link.text,
                                'link_type': "document",
                                'link_description': document_type
                            })
                            # self.mongo.db['other_data'].insert_one(the_other_data)
                            '''
                            # Old Code using Redis: Redis turned out to be too slow for concurrent processes
                            self.redis_client.lpush('other_data', json.dumps(the_other_data))
                            '''
                    else:
                        is_social_media, social_media_type = is_social_media_link(link.url)
                        if is_social_media:
                            the_other_data.append({
                                'parent_url': response.url,
                                'url': link.url,
                                'text': link.text,
                                'link_type': "social_media",
                                'link_description': social_media_type
                            })
                            # self.mongo.db['other_data'].insert_one(the_other_data)
                            '''
                            # Old Code using Redis: Redis turned out to be too slow for concurrent processes
                            self.redis_client.lpush('other_data', json.dumps(the_other_data))
                            '''
                        else:
                            site_links.append(link)
            if the_other_data:
                self.mongo.db['other_data'].insert_many(the_other_data)
        else:
            # config['crawl_other_data'] is False, so do not crawl other data
            for link in links:
                # Avoid crawling documents or social media links
                if not is_document_or_media(link.url):
                    site_links.append(link)

        # Next pages to follow:
        the_to_crawl_urls = []  # list for bulk upload
        for site_link in site_links:
            # print(f' \n site link: {site_link.url} \n')
            # base_url, crawl_it = should_we_crawl_it(site_link.url)  # self.visited_urls_base,
            de_fragmented_url = remove_fragments_from_url(site_link.url)
            if (is_np_domain(de_fragmented_url) or is_special_domain_to_crawl(de_fragmented_url)):  # and de_fragmented_url not in self.visited_urls:
                # is_same_domain(response.url, de_fragmented_url) is a problem because attempting to crawl bbc.com/nepali can lead to bbc.com
                # only follow urls from same domain or other nepali domain and not visited yet
                # if len(self.crawler.engine.slot.inprogress) >= self.crawler.engine.settings.get('CONCURRENT_REQUESTS'):
                #     # send new urls_to_crawl to redis.
                #     # self.redis_client.sadd('url_to_crawl', json.dumps(de_fragmented_url))
                #     self.mongo.append_url_to_crawl(de_fragmented_url)
                if len(de_fragmented_url.strip()) > 0 and len(de_fragmented_url) < 1000:
                    # avoid urls with length greater than 1000
                    the_to_crawl_urls.append(de_fragmented_url)

                # else:
                #     if self.mongo.append_url_crawling(de_fragmented_url):
                #         # yield url to crawl new urls_to_crawl by itself
                #         yield scrapy.Request(de_fragmented_url, callback=self.parse, errback=self.errback_httpbin)  # dont_filter=True : allows visiting same url again
                #         # self.to_visit.append(de_fragmented_url)
                #         # Send crawling notice to server
                #         # self.redis_client.sadd('url_crawling', json.dumps(de_fragmented_url))
        if the_to_crawl_urls:
            self.mongo.append_url_to_crawl(the_to_crawl_urls)
        # append crawled url to visited urls
        # self.mongo.delete_to_crawl(response.request.url)
        ''' Note:
        If a url redirects to another url, then the original url is added to visited urls so as not to visit it again.
        The redirected url is sent via crawled_data and other_data, then visited_url is updated.
        '''

        # Keep the worker busy by getting new urls to crawl
        # Get a few more start urls to crawl next
        number_of_new_urls_required = self.crawler.engine.settings.get('CONCURRENT_REQUESTS') - len(self.crawler.engine.slot.inprogress)
        if number_of_new_urls_required > 0:
            n_concurrent_requests = self.crawler.engine.settings.get('CONCURRENT_REQUESTS')
            fetched_start_urls = [remove_fragments_from_url(data_item['url']) for data_item in self.mongo.fetch_start_urls(n_concurrent_requests)]
            # fetched_start_urls = [remove_fragments_from_url(data_item['url']) for data_item in self.mongo.fetch_start_urls(number_of_new_urls_required)]
            if fetched_start_urls:
                for url in fetched_start_urls:
                    # if url not in self.visited_urls:  # not necessary as we are fetching from mongo
                    #     self.visited_urls.add(url)
                    yield scrapy.Request(url, callback=self.parse)

    def errback_httpbin(self, failure):
        # log all failures
        self.logger.error(repr(failure))

        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

            error_data = {'url': response.url, 'timestamp': time.time(), 'status': 'error', 'status_code': response.status, 'error_type': 'HttpError'}
            self.mongo.append_error_data(error_data)
            # print(f'\n\n\n\n{error_data}\n\n\n\n\n\n')

            if response.status in [400, 401, 403, 404, 405, 408, 429, 500, 502, 503, 504]:
                self.logger.error('Got %s response. URL: %s', response.status, response.url)
                # handle the error here

        elif failure.check(DNSLookupError):
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

            # Save error data on mongodb (no HTTP response is available here)
            error_data = {'url': request.url, 'timestamp': time.time(), 'status': 'error', 'status_code': None, 'error_type': 'DNSLookupError'}
            self.mongo.append_error_data(error_data)
            # print(f'\n\n\n\n{error_data}\n\n\n\n\n\n')

        elif failure.check(TCPTimedOutError, TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)

            # Save error data on mongodb (no HTTP response is available here)
            error_data = {'url': request.url, 'timestamp': time.time(), 'status': 'error', 'status_code': None, 'error_type': 'TimeoutError'}
            self.mongo.append_error_data(error_data)
            # print(f'\n\n\n\n{error_data}\n\n\n\n\n\n')
        else:
            self.logger.error('Unknown error on %s', failure.request.url)

            # Save error data on mongodb
            error_data = {'url': failure.request.url, 'timestamp': time.time(), 'status': 'error', 'status_code': None, 'error_type': 'Unknown'}
            # print(f'\n\n Unknown error : {error_data}')
            self.mongo.append_error_data(error_data)
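A minimal local-run sketch for WorkerSpider, under the assumptions that the Scrapy settings module is `src.settings` (as `src/__main__.py` sets), that the MongoDB credentials (for example `mongo_password`) are available in the environment, and that the urls-collection already holds documents with status 'to_crawl' for fetch_start_urls() to hand out:

import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from src.spiders.worker_spider_v2 import WorkerSpider

# The spider reads its URL queue and config flags from MongoDB, so the
# connection built in Mongo.__init__ must be reachable before starting.
os.environ.setdefault("SCRAPY_SETTINGS_MODULE", "src.settings")

process = CrawlerProcess(get_project_settings())
process.crawl(WorkerSpider)
process.start()  # blocks until the crawl finishes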
src/__main__.py
1"""2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using7`scrapy crawl title_spider`.8
9We recommend you do not modify this file unless you really know what you are doing.10"""11
12# We need to configure the logging first before we import anything else, so that nothing else imports13# `scrapy.utils.log` before we patch it.14from __future__ import annotations15from logging import StreamHandler, getLogger16from typing import Any17from scrapy.utils import log as scrapy_logging18from scrapy.utils.project import get_project_settings19from apify.log import ActorLogFormatter20
21# Define names of the loggers.22MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']23OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']24ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for28# a specific logger, do it in this file.29settings = get_project_settings()30LOGGING_LEVEL = settings['LOG_LEVEL']31
32# Define a logging handler which will be used for the loggers.33apify_handler = StreamHandler()34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:38 """39 Configure a logger with the specified settings.40
41 Args:42 logger_name: The name of the logger to be configured.43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).44 handlers: Optional list of logging handlers.45 """46 logger = getLogger(logger_name)47 logger.setLevel(log_level)48 logger.handlers = []49
50 for handler in handlers:51 logger.addHandler(handler)52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from55# the `main.py` and Scrapy components.56for logger_name in MAIN_LOGGER_NAMES:57 configure_logger(logger_name, LOGGING_LEVEL, apify_handler)58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because63# otherwise we would lose some log messages.64old_configure_logging = scrapy_logging.configure_logging65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:68 """69 We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root70 logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary71 loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here72 these four loggers and the root logger.73 """74 old_configure_logging(*args, **kwargs)75
76 # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`77 # property within spiders. See details in the Spider logger property:78 # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.79 configure_logger(None, LOGGING_LEVEL, apify_handler)80
81 # We modify other loggers only by setting up their log level. A custom log handler is added82 # only to the root logger to avoid duplicate log messages.83 for logger_name in ALL_LOGGER_NAMES:84 configure_logger(logger_name, LOGGING_LEVEL)85
86 # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless87 # messages, especially when running on the platform.88 configure_logger('httpx', 'WARNING')89
90
91scrapy_logging.configure_logging = new_configure_logging92
93# Now we can do the rest of the setup94import asyncio95import os96import nest_asyncio97from scrapy.utils.reactor import install_reactor98from .main import main99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor102# The reactor installation must be done manually before calling `nest_asyncio.apply()`,103# otherwise, it will not work correctly on Windows.104install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')105nest_asyncio.apply()106
107# Specify the path to the Scrapy project settings module108os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'109
110# Run the Apify main coroutine111asyncio.run(main())
src/items.py
1"""2Scrapy item models module3
4This module defines Scrapy item models for scraped data. Items represent structured data5extracted by spiders.6
7For detailed information on creating and utilizing items, refer to the official documentation:8https://docs.scrapy.org/en/latest/topics/items.html9"""10
11from scrapy import Field, Item12
13
14class TitleItem(Item):15 """16 Represents a title item scraped from a web page.17 """18
19 url = Field()20 title = Field()
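For illustration only, TitleItem behaves like a dict-backed Scrapy Item; a small hedged sketch using itemadapter (already used by middlewares.py):

from itemadapter import ItemAdapter

from src.items import TitleItem

item = TitleItem(url='https://apify.com/', title='Apify')
print(item['title'])               # 'Apify'
print(ItemAdapter(item).asdict())  # {'url': 'https://apify.com/', 'title': 'Apify'}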
src/main.py
"""
This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
processes the Actor's input and executes the Scrapy spider. Additionally, it updates the Scrapy project settings by
applying Apify-related settings, which include adding a custom scheduler, retry middleware, and an item pipeline
for pushing data to the Apify dataset.

Customization:
--------------

Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
components like spiders and handling Actor input. However, make sure you have a clear understanding of your
modifications. For instance, removing `apply_apify_settings` breaks the integration between Scrapy and Apify.

Documentation:
--------------

For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
other topics, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
"""

from __future__ import annotations

import os

from scrapy.crawler import CrawlerProcess

from apify import Actor
from apify.scrapy.utils import apply_apify_settings

# Import your Scrapy spider here
# from .spiders.title import TitleSpider as Spider
from .spiders.worker_spider_v2 import WorkerSpider as Spider
# from .spiders.quotes_spider import QuotesSpider as Spider

# Default input values for local execution using `apify run`
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]


async def main() -> None:
    """
    Apify Actor main coroutine for executing the Scrapy spider.
    """
    async with Actor:
        Actor.log.info('....................................')
        Actor.log.info('Actor is being executed...')
        Actor.log.info(f'env: {os.environ.get("test")}')
        Actor.log.info('....................................')

        # Process the Actor input
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
        proxy_config = actor_input.get('proxyConfiguration')

        # Add the start URLs to the request queue
        rq = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            await rq.add_request(request={'url': url, 'method': 'GET'})

        # Apply the Apify settings; they will override the Scrapy project settings
        settings = apply_apify_settings(proxy_config=proxy_config)

        # Execute the spider using Scrapy CrawlerProcess
        process = CrawlerProcess(settings, install_root_handler=False)
        process.crawl(Spider)
        process.start()
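Because the object returned by apply_apify_settings() is handed straight to CrawlerProcess, it can be treated as a regular Scrapy Settings instance, so project-specific overrides can be layered on top before the crawl starts. A hedged sketch reusing proxy_config and Spider from the file above (the particular values are illustrative, not recommendations baked into this project):

from scrapy.crawler import CrawlerProcess
from apify.scrapy.utils import apply_apify_settings

settings = apply_apify_settings(proxy_config=proxy_config)
settings.set('CONCURRENT_REQUESTS', 8)      # fewer parallel requests per process
settings.set('DOWNLOAD_DELAY', 1)           # one-second pause between requests to a site
settings.set('AUTOTHROTTLE_ENABLED', True)  # adapt the delay to server responses

process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(Spider)
process.start()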
src/middlewares.py
"""
Scrapy middlewares module

This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
responses, typically used for adding custom headers, retrying requests, and handling exceptions.

There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information
on creating and utilizing them, refer to the official documentation:
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
https://docs.scrapy.org/en/latest/topics/spider-middleware.html
"""

from __future__ import annotations
from typing import Generator, Iterable

from scrapy import Request, Spider, signals
from scrapy.crawler import Crawler
from scrapy.http import Response

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TitleSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response: Response, spider: Spider) -> None:
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(
        self,
        response: Response,
        result: Iterable,
        spider: Spider,
    ) -> Generator[Iterable[Request] | None, None, None]:
        # Called with the results returned from the spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(
        self,
        response: Response,
        exception: BaseException,
        spider: Spider,
    ) -> Iterable[Request] | None:
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(
        self, start_requests: Iterable[Request], spider: Spider
    ) -> Iterable[Request]:
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider: Spider) -> None:
        pass


class TitleDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request: Request, spider: Spider) -> Request | Response | None:
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None:
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider: Spider) -> None:
        pass
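The module docstring above lists adding custom headers as a typical middleware use. As a hedged, illustrative example (the class name and header value are hypothetical, not part of this project), a downloader middleware that sets an Accept-Language header on every outgoing request; it would be activated via DOWNLOADER_MIDDLEWARES in src/settings.py:

from scrapy import Request, Spider


class AcceptLanguageMiddleware:
    """Hypothetical downloader middleware, for illustration only."""

    def process_request(self, request: Request, spider: Spider) -> None:
        # Prefer Nepali content, falling back to English; returning None lets
        # Scrapy continue processing the (now modified) request.
        request.headers.setdefault('Accept-Language', 'ne,en;q=0.8')
        return None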
src/pipelines.py
"""
Scrapy item pipelines module

This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
that handle the scraped items, typically used for cleaning, validating, and persisting data.

For detailed information on creating and utilizing item pipelines, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

from scrapy import Spider

from .items import TitleItem


class TitleItemPipeline:
    """
    This item pipeline defines processing steps for TitleItem objects scraped by spiders.
    """

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        # Do something with the item here, such as cleaning it or persisting it to a database
        return item
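The process_item() above is a pass-through. As a hedged illustration (the class name and rules are hypothetical, not part of this project), a pipeline that trims whitespace and drops items with an empty title using Scrapy's standard DropItem exception; it would be enabled via ITEM_PIPELINES in src/settings.py:

from scrapy import Spider
from scrapy.exceptions import DropItem

from .items import TitleItem


class CleanTitleItemPipeline:
    """Hypothetical pipeline, for illustration only."""

    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
        title = (item.get('title') or '').strip()
        if not title:
            raise DropItem(f"Missing title for {item.get('url')}")
        item['title'] = title
        return item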
src/settings.py
# Scrapy settings for the scrapy_engine project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "scrapy_engine"

# ----------------------------------------------------
# code from default_spider
# BOT_NAME = 'titlebot'
# DEPTH_LIMIT = 1
LOG_LEVEL = 'INFO'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = False

NEWSPIDER_MODULE = 'src.spiders'
SPIDER_MODULES = ['src.spiders']
# ITEM_PIPELINES = {
#     'src.pipelines.TitleItemPipeline': 123,
# }
# SPIDER_MIDDLEWARES = {
#     'src.middlewares.TitleSpiderMiddleware': 543,
# }
# DOWNLOADER_MIDDLEWARES = {
#     'src.middlewares.TitleDownloaderMiddleware': 543,
# }
# ----------------------------------------------------


# # Crawl responsibly by identifying yourself (and your website) on the user-agent
# # USER_AGENT = "scrapy_engine (+http://www.yourdomain.com)"

# # Obey robots.txt rules
# ROBOTSTXT_OBEY = False

# # Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 15  # 10 for replit # 32

# # Configure a delay for requests to the same website (default: 0)
# # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# # See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1  # 3
# # The download delay setting will honor only one of:
# # CONCURRENT_REQUESTS_PER_DOMAIN = 16
# # CONCURRENT_REQUESTS_PER_IP = 16

# # Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# # Disable Telnet Console (enabled by default)
# # TELNETCONSOLE_ENABLED = False

# # Override the default request headers:
# # DEFAULT_REQUEST_HEADERS = {
# #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# #     "Accept-Language": "en",
# # }

# # Enable or disable spider middlewares
# # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# # SPIDER_MIDDLEWARES = {
# #     "scrapy_engine.middlewares.ScrapyEngineSpiderMiddleware": 543,
# # }

# # Enable or disable downloader middlewares
# # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# # DOWNLOADER_MIDDLEWARES = {
# #     "scrapy_engine.middlewares.ScrapyEngineDownloaderMiddleware": 543,
# # }

# # Enable or disable extensions
# # See https://docs.scrapy.org/en/latest/topics/extensions.html
# # EXTENSIONS = {
# #     "scrapy.extensions.telnet.TelnetConsole": None,
# # }

# # Configure item pipelines
# # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# # ITEM_PIPELINES = {
# #     "scrapy_engine.pipelines.ScrapyEnginePipeline": 300,
# # }

# # Enable and configure the AutoThrottle extension (disabled by default)
# # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True  # dynamically adjusts the download delay based on server responses
# # The initial download delay
# AUTOTHROTTLE_START_DELAY = 2  # 5
# # The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 30  # 60
# # The average number of requests Scrapy should be sending in parallel to
# # each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# # Enable showing throttling stats for every response received:
# # AUTOTHROTTLE_DEBUG = False

# # Enable and configure HTTP caching (disabled by default)
# # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = False  # True
# # HTTPCACHE_EXPIRATION_SECS = 0
# # HTTPCACHE_DIR = "httpcache"
# # HTTPCACHE_IGNORE_HTTP_CODES = []
# # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# # Set settings whose default value is deprecated to a future-proof value
# REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
# TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# FEED_EXPORT_ENCODING = "utf-8"


# LOG_LEVEL = 'INFO'       # overview of program execution
# # LOG_LEVEL = 'DEBUG'     # detailed information about program execution
# # LOG_LEVEL = 'WARNING'   # only show warnings and errors
# # LOG_LEVEL = 'ERROR'     # only show errors
# # LOG_LEVEL = 'CRITICAL'  # only show critical errors
# # LOG_LEVEL = 'NOTSET'    # show everything


# # ## ---------------------------------------------------
# # ## ---------------- REDIS-SPECIFIC SETTINGS -----------
# # ## ---------------------------------------------------
# # # Enables storing the scheduler's request queue in Redis.
# # SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# # # Ensure all spiders share the same duplicates filter through Redis.
# # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# # # Redis connection URL
# # REDIS_URL = 'redis://default:saQznB445IS0AEypcHVpzLKtGTxXilui@redis-13950.c292.ap-southeast-1-1.ec2.cloud.redislabs.com:13950'
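A note on the commented-out REDIS_URL above: committing a full connection string, password included, is risky even while the setting is disabled. A hedged sketch (the environment-variable names are assumptions; python-dotenv from requirements.txt could be used to load them) of assembling the URL at import time instead:

import os

# Assumed variable names; adjust to whatever the deployment actually provides.
REDIS_URL = (
    f"redis://default:{os.environ['REDIS_PASSWORD']}"
    f"@{os.environ['REDIS_HOST']}:{os.environ['REDIS_PORT']}"
)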
src/settings_default.py
"""
Scrapy settings module

This module contains Scrapy settings for the project, defining various configurations and options.

For more comprehensive details on Scrapy settings, refer to the official documentation:
http://doc.scrapy.org/en/latest/topics/settings.html
"""

# You can update these options and add new ones
BOT_NAME = 'titlebot'
DEPTH_LIMIT = 1
LOG_LEVEL = 'INFO'
NEWSPIDER_MODULE = 'src.spiders'
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
ROBOTSTXT_OBEY = True
SPIDER_MODULES = ['src.spiders']
ITEM_PIPELINES = {
    'src.pipelines.TitleItemPipeline': 123,
}
SPIDER_MIDDLEWARES = {
    'src.middlewares.TitleSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'src.middlewares.TitleDownloaderMiddleware': 543,
}
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.7.0
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.1

langid==1.1.6
pymongo==4.6.1
python-dotenv==1.0.0
# redis==5.0.1
requests==2.31.0
Twisted==22.10.0
scrapy.cfg
[settings]
default = src.settings

[deploy]
project = src