news-scraper
Deprecated
Input a list of news URLs to fetch article details: headline, subtitle, date, content, etc.
Pricing: Pay per usage
Total users: 5
Monthly users: 5
Last modified: 10 months ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "my-actor", "title": "Getting started with Python and Scrapy", "description": "Scrapes titles of websites using Scrapy.", "version": "0.0", "meta": { "templateId": "python-scrapy" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Python Scrapy Scraper", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ { "url": "https://apify.com" } ], "editor": "requestListSources" }, "proxyConfiguration": { "sectionCaption": "Proxy and HTTP configuration", "title": "Proxy configuration", "type": "object", "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.", "editor": "proxy", "prefill": { "useApifyProxy": true }, "default": { "useApifyProxy": true } } }, "required": ["startUrls"]}
src/newspapers/aajtak.in.json
{ "website": "aajtak.in", "channel": "Aaj Tak", "language": { "": "Hindi", "bangla": "Bangla" }, "author": { "text": "//div[@class=\"brand-detial-main\"]/h4" }, "content": { "text": "div[class=\"story-with-main-sec\"]", "p": true, "skip_p": [ "ये भी पढ़ें" ] }, "image": { "text": "div[class=\"content-area\"]" }, "subtitle": { "text": "h2[id=\"copy_true\"]" }, "title": { "text": "h1" }, "uploaddate": [ { "text": "meta[property=\"article:published_time\"]", "to_datetime": "" }, { "text": "div[class=\"brand-detial-main\"]", "re": "(d{1,2}) (month) (d{4})", "order": [ 2, 1, 0 ] } ]}
src/spiders/__init__.py
1"""2Scrapy spiders package3
4This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape5and process data from websites.6
7For detailed information on creating and utilizing spiders, refer to the official documentation:8https://docs.scrapy.org/en/latest/topics/spiders.html9"""
src/spiders/title.py
from __future__ import annotations

from typing import Generator
from urllib.parse import urljoin

from ..news_functions import *
from scrapy import Request, Spider
from scrapy.exceptions import NotSupported
from scrapy.responsetypes import Response

from ..items import TitleItem
import json
import os


class TitleSpider(Spider):
    """
    Scrapes news article pages, extracting the fields configured for each website in src/newspapers/.
    """

    name = 'title_spider'

    # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
    # when the project is executed using Apify.
    start_urls = []

    def parse(self, response: Response) -> Generator[TitleItem | Request, None, None]:
        """
        Parse the web page response.

        Args:
            response: The web page response.

        Yields:
            A dict with the extracted article fields (url, uploaddate, title, subtitle, author, content).
        """
        self.logger.info('TitleSpider is parsing %s...', response)

        # Load the selector config for this website and extract each configured field.
        url = response.url
        website = get_website(url)
        with open('/usr/src/app/src/newspapers/' + website + '.json', 'r') as config_file:
            x = json.load(config_file)
        mongo_dict = {'url': response.url}
        for field in ['uploaddate', 'title', 'subtitle', 'author', 'content']:
            if not x or field not in x:
                continue
            try:
                self.logger.debug('Extracting field %s', field)
                get_field('Scrapy', response, x, mongo_dict, field)
                if field in mongo_dict and mongo_dict[field] in ['', None]:
                    del mongo_dict[field]
            except (IndexError, ValueError, NotSupported):
                pass
        yield mongo_dict
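Since the spider is driven entirely by these JSON configs, a quick way to sanity-check one is to feed a saved article page into parse() through a fake response. The sketch below is illustrative only: the file name and URL are placeholders, and it assumes the hard-coded /usr/src/app/src/newspapers/ config path exists (i.e. you run it inside the Actor image or adjust the path in the spider).

from scrapy.http import HtmlResponse
from src.spiders.title import TitleSpider

# saved_article.html is a page you downloaded yourself; the URL decides which config is loaded.
with open('saved_article.html', 'rb') as fh:
    body = fh.read()

response = HtmlResponse(url='https://www.aajtak.in/india/story/example-article', body=body, encoding='utf-8')
for item in TitleSpider().parse(response):
    print(item)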
src/__main__.py
1"""2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using7`scrapy crawl title_spider`.8
9We recommend you do not modify this file unless you really know what you are doing.10"""11
12# We need to configure the logging first before we import anything else, so that nothing else imports13# `scrapy.utils.log` before we patch it.14from __future__ import annotations15from logging import StreamHandler, getLogger16from typing import Any17from scrapy.utils import log as scrapy_logging18from scrapy.utils.project import get_project_settings19from apify.log import ActorLogFormatter20
21# Define names of the loggers.22MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']23OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']24ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for28# a specific logger, do it in this file.29settings = get_project_settings()30LOGGING_LEVEL = settings['LOG_LEVEL']31
32# Define a logging handler which will be used for the loggers.33apify_handler = StreamHandler()34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:38 """39 Configure a logger with the specified settings.40
41 Args:42 logger_name: The name of the logger to be configured.43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).44 handlers: Optional list of logging handlers.45 """46 logger = getLogger(logger_name)47 logger.setLevel(log_level)48 logger.handlers = []49
50 for handler in handlers:51 logger.addHandler(handler)52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from55# the `main.py` and Scrapy components.56for logger_name in MAIN_LOGGER_NAMES:57 configure_logger(logger_name, LOGGING_LEVEL, apify_handler)58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because63# otherwise we would lose some log messages.64old_configure_logging = scrapy_logging.configure_logging65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:68 """69 We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root70 logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary71 loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here72 these four loggers and the root logger.73 """74 old_configure_logging(*args, **kwargs)75
76 # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`77 # property within spiders. See details in the Spider logger property:78 # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.79 configure_logger(None, LOGGING_LEVEL, apify_handler)80
81 # We modify other loggers only by setting up their log level. A custom log handler is added82 # only to the root logger to avoid duplicate log messages.83 for logger_name in ALL_LOGGER_NAMES:84 configure_logger(logger_name, LOGGING_LEVEL)85
86 # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless87 # messages, especially when running on the platform.88 configure_logger('httpx', 'WARNING')89
90
91scrapy_logging.configure_logging = new_configure_logging92
93# Now we can do the rest of the setup94import asyncio95import os96import nest_asyncio97from scrapy.utils.reactor import install_reactor98from .main import main99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor102# The reactor installation must be done manually before calling `nest_asyncio.apply()`,103# otherwise, it will not work correctly on Windows.104install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')105nest_asyncio.apply()106
107# Specify the path to the Scrapy project settings module108os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'109
110# Run the Apify main coroutine111asyncio.run(main())
src/items.py
1"""2Scrapy item models module3
4This module defines Scrapy item models for scraped data. Items represent structured data5extracted by spiders.6
7For detailed information on creating and utilizing items, refer to the official documentation:8https://docs.scrapy.org/en/latest/topics/items.html9"""10
11from scrapy import Field, Item12import scrapy13
14
15class TitleItem(Item):16 """17 Represents a title item scraped from a web page.18 """19
20 url = scrapy.Field()21 header_link = scrapy.Field()22 website = scrapy.Field()23 date_added = scrapy.Field()24 last_updated = scrapy.Field()25 channel = scrapy.Field()26 title = scrapy.Field()27 subtitle = scrapy.Field()28 image = scrapy.Field()29 author = scrapy.Field()30 uploaddate = scrapy.Field()31 content = scrapy.Field()32 views = scrapy.Field()33 comments = scrapy.Field()34 tags = scrapy.Field()35 likes = scrapy.Field()36 dislikes = scrapy.Field()37 news_country = scrapy.Field()38 news_region = scrapy.Field()39 update = scrapy.Field()
src/main.py
1"""2This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine3processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by4applying Apify-related settings. Which includes adding a custom scheduler, retry middleware, and an item pipeline5for pushing data to the Apify dataset.6
7Customization:8--------------9
10Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy11components like spiders and handling Actor input. However, make sure you have a clear understanding of your12modifications. For instance, removing `apply_apify_settings` break the integration between Scrapy and Apify.13
14Documentation:15--------------16
17For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and18other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.19"""20
21from __future__ import annotations22
23from scrapy.crawler import CrawlerProcess24
25from apify import Actor26from apify.scrapy.utils import apply_apify_settings27
28# Import your Scrapy spider here29from .spiders.title import TitleSpider as Spider30
31# Default input values for local execution using `apify run`32LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]33
34
35async def main() -> None:36 """37 Apify Actor main coroutine for executing the Scrapy spider.38 """39 async with Actor:40 Actor.log.info('Actor is being executed...')41
42 # Process Actor input43 actor_input = await Actor.get_input() or {}44 start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)45 proxy_config = actor_input.get('proxyConfiguration')46
47 # Add start URLs to the request queue48 rq = await Actor.open_request_queue()49 for start_url in start_urls:50 url = start_url.get('url')51 await rq.add_request(request={'url': url, 'method': 'GET'})52
53 # Apply Apify settings, it will override the Scrapy project settings54 settings = apply_apify_settings(proxy_config=proxy_config)55
56 # Execute the spider using Scrapy CrawlerProcess57 process = CrawlerProcess(settings, install_root_handler=False)58 process.crawl(Spider)59 process.start()
src/middlewares.py
1"""2Scrapy middlewares module3
4This module defines Scrapy middlewares. Middlewares are processing components that handle requests and5responses, typically used for adding custom headers, retrying requests, and handling exceptions.6
7There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information8on creating and utilizing them, refer to the official documentation:9https://docs.scrapy.org/en/latest/topics/downloader-middleware.html10https://docs.scrapy.org/en/latest/topics/spider-middleware.html11"""12
13from __future__ import annotations14from typing import Generator, Iterable15
16from scrapy import Request, Spider, signals17from scrapy.crawler import Crawler18from scrapy.http import Response19
20# useful for handling different item types with a single interface21from itemadapter import is_item, ItemAdapter22
23
24class TitleSpiderMiddleware:25 # Not all methods need to be defined. If a method is not defined,26 # scrapy acts as if the spider middleware does not modify the27 # passed objects.28
29 @classmethod30 def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:31 # This method is used by Scrapy to create your spiders.32 s = cls()33 crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)34 return s35
36 def process_spider_input(self, response: Response, spider: Spider) -> None:37 # Called for each response that goes through the spider38 # middleware and into the spider.39
40 # Should return None or raise an exception.41 return None42
43 def process_spider_output(44 self,45 response: Response,46 result: Iterable,47 spider: Spider,48 ) -> Generator[Iterable[Request] | None, None, None]:49 # Called with the results returned from the Spider, after50 # it has processed the response.51
52 # Must return an iterable of Request, or item objects.53 for i in result:54 yield i55
56 def process_spider_exception(57 self,58 response: Response,59 exception: BaseException,60 spider: Spider,61 ) -> Iterable[Request] | None:62 # Called when a spider or process_spider_input() method63 # (from other spider middleware) raises an exception.64
65 # Should return either None or an iterable of Request or item objects.66 pass67
68 def process_start_requests(69 self, start_requests: Iterable[Request], spider: Spider70 ) -> Iterable[Request]: # Called with the start requests of the spider, and works71 # similarly to the process_spider_output() method, except72 # that it doesn’t have a response associated.73
74 # Must return only requests (not items).75 for r in start_requests:76 yield r77
78 def spider_opened(self, spider: Spider) -> None:79 pass80
81
82class TitleDownloaderMiddleware:83 # Not all methods need to be defined. If a method is not defined,84 # scrapy acts as if the downloader middleware does not modify the85 # passed objects.86
87 @classmethod88 def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:89 # This method is used by Scrapy to create your spiders.90 s = cls()91 crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)92 return s93
94 def process_request(self, request: Request, spider: Spider) -> Request | Response | None:95 # Called for each request that goes through the downloader96 # middleware.97
98 # Must either:99 # - return None: continue processing this request100 # - or return a Response object101 # - or return a Request object102 # - or raise IgnoreRequest: process_exception() methods of103 # installed downloader middleware will be called104 return None105
106 def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:107 # Called with the response returned from the downloader.108
109 # Must either;110 # - return a Response object111 # - return a Request object112 # - or raise IgnoreRequest113 return response114
115 def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None:116 # Called when a download handler or a process_request()117 # (from other downloader middleware) raises an exception.118
119 # Must either:120 # - return None: continue processing this exception121 # - return a Response object: stops process_exception() chain122 # - return a Request object: stops process_exception() chain123 pass124
125 def spider_opened(self, spider: Spider) -> None:126 pass
src/news_functions.py
import datetime
import re

months = {
    '01':'01','1':'1',
    '02':'02','2':'2',
    '03':'03','3':'3',
    '04':'04','4':'4',
    '05':'05','5':'5',
    '06':'06','6':'6',
    '07':'07','7':'7',
    '08':'08','8':'8',
    '09':'09','9':'9',
    '10':'10',
    '11':'11',
    '12':'12',
    '13':'13',
    '14':'14',
    '15':'15',
    '16':'16',
    '17':'17',
    '18':'18',
    '19':'19',
    '20':'20',
    '21':'21',
    '22':'22',
    '23':'23',
    '24':'24',
    '25':'25',
    '26':'26',
    '27':'27',
    '28':'28',
    '29':'29',
    '30':'30',
    '31':'31',
    'jan':'01','januari':'01','january':'01','janvier':'01','જાન્યુઆરી':'01','जनवरी':'01','जानेवारी':'01','jany':'01','জানুয়ারি':'01',
    'feb':'02','februari':'02','february':'02','février':'02','ફેબ્રુઆરી':'02','फरवरी':'02','फेब्रुवारी':'02','ഫെബ്രുവരി':'02','ফেব্রুয়ারি':'02',
    'mar':'03','march':'03','mars':'03','માર્ચ':'03','मार्च':'03','maart':'03','মার্চ':'03',
    'apr':'04','april':'04','avril':'04','એપ્રિલ':'04','अप्रैल':'04','एप्रिल':'04','এপ্রিল':'04',
    'may':'05','mei':'05','mai':'05','મે':'05','मई':'05','मे':'05','মে':'05','മെയ്':'05',
    'jun':'06','june':'06','juin':'06','juni':'06','જૂન':'06','जून':'06','জুন':'06',
    'jul':'07','july':'07','juillet':'07','જુલાઈ':'07','जुलाई':'07','जुलै':'07','জুলাই':'07','juli':'07','jyuly':'07','ജൂലൈ':'07',
    'aug':'08','august':'08','août':'08','ઓગસ્ટ':'08','अगस्त':'08','ऑगस्ट':'08','augustus':'08','আগস্ট':'08',
    'sep':'09','sept':'09','september':'09','septembre':'09','સપ્ટેમ્બર':'09','सितम्बर':'09','सप्टेंबर':'09','সেপ্টেম্বর':'09',
    'oct':'10','oktober':'10','october':'10','octobre':'10','ઓક્ટોબર':'10','अक्टूबर':'10','ऑक्टोबर':'10','অক্টোবর':'10',
    'nov':'11','november':'11','novembre':'11','નવેમ્બર':'11','नवम्बर':'11','नोव्हेंबर':'11','नवंबर':'11','নভেম্বর':'11',
    'dec':'12','december':'12','décembre':'12','ડિસેમ્બર':'12','दिसंबर':'12','डिसेंबर':'12','ডিসেম্বর':'12'
}
months_pattern = r'[\u0900-\u097F]+|[\u0A80-\u0AFF]+|[\u0C00-\u0C7F]+|[\u0D00-\u0D7F]+|[\u0B80-\u0BFF]+|[\u0A80-\u0AFF]+|[\u0A00-\u0A7F]+|[\u0B00-\u0B7F]+|[\u0980-\u09FF]+|[\u0600-\u06FF]+|[A-Za-zéû]+'
url_pattern = r'[\w\:\/\.\?\[\]\=\&\;\-\%]+'
this_month = datetime.datetime.now()
this_hour = datetime.datetime(this_month.year,this_month.month,this_month.day,this_month.hour)
this_day = datetime.datetime(this_month.year,this_month.month,this_month.day)
this_month = datetime.datetime(this_month.year,this_month.month,1)
this_month = this_month - datetime.timedelta(days=1)
this_month = datetime.datetime(this_month.year,this_month.month,1)
def convert_css_selector(text):
    ''' This function converts selenium selectors for scrapy / bs4

    Parameters
    --------
    text: str
        String Selenium Selector

    Returns
    --------
    tuple
        Selector tag with class/id and its value. For eg. (div,class,entry-content)
    '''
    if '//' in text:
        if '/' in text.replace('//',''):
            return text
        return list(re.findall('\/\/(\w+)\[\@(\w+)\*?\=["\']([a-zA-Z0-9\:\-\_]+)["\']\]',text)[0])
    if '[' in text:
        return list(re.findall('(\w+)\[(\w+)\*?\=["\']([a-zA-Z0-9\:\-\_]+)["\']\]',text)[0])
    return [text,'','']
def get_text_from_div(content):
    ''' Extracts plain text from html tags

    Parameters
    --------
    content: str
        Input string

    Returns
    --------
    Cleaned string with plain text
    '''
    content = content.replace('<br>','\n')
    temp_content = ''
    flag = 1
    for i in range(len(content)):
        if content[i] == '<':
            if len(temp_content) and temp_content[-1] == '.':
                temp_content = temp_content+' '
            flag = 0
            temp_content = temp_content+' '
            continue
        if content[i] == '>' and content[:i].split('<')[-1].startswith('script') == False and content[:i].split('<')[-1].startswith('style') == False:
            flag = 1
            continue
        if flag:
            temp_content = temp_content+content[i]
    # Collapse runs of whitespace left behind by the removed tags.
    while '  ' in temp_content:
        temp_content = temp_content.replace('  ',' ')
    return temp_content
def get_website(link):
    ''' Get website name from link
    For e.g. https://www.theaustralian.com.au/news/ is changed to theaustralian.com.au
    https://www.youtube.com/watch?v=..... is changed to youtube.com

    Parameters
    --------
    link: str
        URL text

    Returns
    --------
    str
        website name
    '''
    pattern = '[a-z0-9\-]*\.*[a-z0-9\-]*\.*[a-z0-9\-]*\.*[a-z0-9\-]+\.[a-z]+'
    website = re.findall(pattern,link)[0].replace('www.','').replace('www1.','').split('.')
    if website[-2] in ['ac','blogspot','co','com','go','indiatimes','nic','net','org','gov']:
        website = '.'.join(website[-3:])
    else:
        website = '.'.join(website[-2:])
    return website
def to_datetime(uploaddate):
    ''' Date string is converted to datetime.datetime

    Parameters
    --------
    uploaddate: str
        Date string

    Returns
    --------
    datetime.datetime
    '''
    pattern = '(\d{4})-(\d{2})-(\d{2})'
    uploaddate2 = re.findall(pattern,uploaddate)
    if len(uploaddate2):
        uploaddate2 = uploaddate2[0]
        return datetime.datetime(int(uploaddate2[0]),int(uploaddate2[1]),int(uploaddate2[2]))
    pattern = '(\d{2})-(\d{2})-(\d{2})'
    try:
        uploaddate2 = re.findall(pattern,uploaddate)[0]
        return datetime.datetime(int('20'+uploaddate2[2]),int(uploaddate2[1]),int(uploaddate2[0]))
    except IndexError:
        pass
    pattern = '(\d{4})\/(\d{1,2})\/(\d{1,2})'
    try:
        uploaddate2 = re.findall(pattern,uploaddate)[0]
        return datetime.datetime(int(uploaddate2[0]),int(uploaddate2[1]),int(uploaddate2[2]))
    except IndexError:
        pass
    pattern = '(\d{1,2})\/(\d{1,2})\/(\d{4})'
    try:
        uploaddate2 = re.findall(pattern,uploaddate)[0]
        return datetime.datetime(int(uploaddate2[2]),int(uploaddate2[1]),int(uploaddate2[0]))
    except IndexError:
        pass
    return None
def get_single_field(mode,obj,f,mongo_dict,field):
    ''' This function gets field using field element information

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    obj: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        response of webpage
    f: dict
        website attributes element
    mongo_dict: dict
        news record
    field: str
        field to be extracted
    '''
    if 'index' in f:
        index = f['index']
    else:
        index = 0
    if mode == 'Scrapy':
        f['text'] = convert_selenium_to_scrapy(f['text'])
    if f['text'] == 'url':
        match mode:
            case 'Scrapy':
                mongo_dict[field] = process_field(mode,obj.url,field,f)
            case 'Selenium':
                mongo_dict[field] = process_field(mode,obj.current_url,field,f)
    elif '//' in f['text'][:2]:
        if 'next' in f:
            if mode == 'Scrapy':
                f['next']['text'] = convert_selenium_to_scrapy(f['next']['text'])
            if '//' in f['next']['text'][:2]:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.xpath(f['text'])[index].xpath(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.XPATH,f['text'])[index].find_element(By.XPATH,f['next']['text']),field,f)
            else:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.xpath(f['text'])[index].css(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.XPATH,f['text'])[index].find_element(By.CSS_SELECTOR,f['next']['text']),field,f)
        else:
            match mode:
                case 'Scrapy':
                    mongo_dict[field] = process_field(mode,obj.xpath(f['text'])[index],field,f)
                case 'Selenium':
                    mongo_dict[field] = process_field(mode,obj.find_elements(By.XPATH,f['text'])[index],field,f)
    else:
        if 'next' in f:
            if mode == 'Scrapy':
                f['next']['text'] = convert_selenium_to_scrapy(f['next']['text'])
            if '//' in f['next']['text'][:2]:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.css(f['text'])[index].xpath(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.CSS_SELECTOR,f['text'])[index].find_element(By.XPATH,f['next']['text']),field,f)
            else:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.css(f['text'])[index].css(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.CSS_SELECTOR,f['text'])[index].find_element(By.CSS_SELECTOR,f['next']['text']),field,f)
        else:
            match mode:
                case 'Scrapy':
                    mongo_dict[field] = process_field(mode,obj.css(f['text'])[index],field,f)
                case 'Selenium':
                    mongo_dict[field] = process_field(mode,obj.find_elements(By.CSS_SELECTOR,f['text'])[index],field,f)
    if 'split' in f:
        mongo_dict[field] = mongo_dict[field].split(f['split'])[f['split_index']]
def get_field(mode,obj,x,mongo_dict,field):
    ''' This function gets field using field element information

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    obj: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        response of webpage
    x: dict
        website attributes element
    mongo_dict: dict
        news record
    field: str
        field to be extracted
    '''
    if type(x[field]) == list:
        for f in x[field]:
            try:
                get_single_field(mode,obj,f,mongo_dict,field)
                break
            except IndexError:
                pass
    else:
        get_single_field(mode,obj,x[field],mongo_dict,field)
    if field in mongo_dict and type(mongo_dict[field]) == str:
        mongo_dict[field] = mongo_dict[field].replace('\n',' ').strip()
def process_field(mode,element,field,f):
    ''' This function processes the text from an element that is extracted based on what field it is
    For e.g. for uploaddate, text will be converted to datetime.datetime,
    for content, all paragraphs will be extracted and joined and so on

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    element: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        Scrapy element
    field: str
        field to be processed (title, subtitle, author, etc)
    f: dict
        website attributes element

    Returns
    --------
    str/int
        Processed field (title/subtitle/author, etc)
    '''
    print('reached',field)
    match field:
        case 'subtitle'|'author'|'title':
            if field == 'author' and type(element) == list:
                match mode:
                    case 'Scrapy':
                        return [get_text_from_div(e.extract()).strip() for e in element]
                    case 'Selenium':
                        return [get_text_from_div(e.get_attribute('innerHTML')).strip() for e in element]
            match mode:
                case 'Scrapy':
                    return get_text_from_div(element.extract())
                case 'Selenium':
                    return get_text_from_div(element.get_attribute('innerHTML'))
        case 'uploaddate':
            match f['text']:
                case 'html':
                    if 'today' in f:
                        today = datetime.datetime.now()
                        return datetime.datetime(today.year,today.month,today.day)
                    match mode:
                        case 'Scrapy':
                            uploaddate = element.extract().lower()
                        case 'Selenium':
                            uploaddate = element.get_attribute('innerHTML').lower()
                    if 'to_datetime' in f:
                        return to_datetime(uploaddate.split(f['to_datetime'])[1].split('"')[2][:10])
                    pattern = f['re'].replace('month',months_pattern).replace('d{','\\d{')
                    uploaddate = re.findall(pattern, uploaddate)[0]
                case 'url':
                    if 're' in f:
                        if 'order' not in f and 'to_datetime' in f:
                            f['order'] = [0,1,2]
                        pattern = f['re'].replace('month',months_pattern).replace('d{','\\d{')
                        uploaddate = re.findall(pattern, element)[0]
                    else:
                        return to_datetime(element)
                case _:
                    if 'to_datetime' in f and f['to_datetime'] == '':
                        match mode:
                            case 'Scrapy':
                                return to_datetime(element.extract())
                            case 'Selenium':
                                if 'meta[' in f['text']:
                                    return to_datetime(element.get_attribute('content'))
                                if f['text'] == 'time':
                                    return to_datetime(element.get_attribute('datetime'))
                                return to_datetime(element.get_attribute('innerHTML'))
                    elif f['re'] == 'ago':
                        match mode:
                            case 'Scrapy':
                                return process_uploaddate(element.extract())
                            case 'Selenium':
                                return process_uploaddate(element.get_attribute('innerHTML'))
                    else:
                        pattern = f['re'].replace('month',months_pattern).replace('d{','\\d{')
                        match mode:
                            case 'Scrapy':
                                text = convert_numbers(get_text_from_div(element.extract()).lower())
                            case 'Selenium':
                                if 'meta[' in f['text']:
                                    text = convert_numbers(get_text_from_div(element.get_attribute('content')).lower())
                                else:
                                    text = convert_numbers(get_text_from_div(element.get_attribute('innerHTML')).lower())
                        uploaddate = re.findall(pattern,text)[0]
            date_list = [int(uploaddate[f['order'][i]]) if len(uploaddate[f['order'][i]]) == 4 and uploaddate[f['order'][i]].isnumeric() else int(months[uploaddate[f['order'][i]].lower()]) for i in range(len(f['order']))]
            if len(date_list) == 2:
                uploaddate = datetime.datetime(datetime.datetime.now().year,date_list[0],date_list[1])
                if uploaddate < datetime.datetime.now():
                    return uploaddate
                return datetime.datetime(datetime.datetime.now().year-1,date_list[0],date_list[1])
            if 'add_2000' in f:
                date_list[0] = date_list[0]+2000
            return datetime.datetime(date_list[0],date_list[1],date_list[2])
        case 'image':
            return extract_image_and_caption(mode,element)
        case 'content':
            if 'p' not in f:
                f['p'] = True
            if f['p'] == True:
                if 'skip_p' in f:
                    match mode:
                        case 'Scrapy':
                            return get_text_from_div('\n'.join([e.extract() for e in element.css('p') if len(re.findall('|'.join(f['skip_p']),get_text_from_div(e.extract()).strip())) == 0 and (len(e.css('a')) and get_text_from_div(e.extract().strip()) == get_text_from_div(e.css('a')[0].extract())) == False]))
                        case 'Selenium':
                            return get_text_from_div('\n'.join([e for e in element.find_element(By.CSS_SELECTOR,'p').get_attribute('innerHTML') if len(re.findall('|'.join(f['skip_p']),e)) == 0]))
                match mode:
                    case 'Scrapy':
                        return get_text_from_div('\n'.join([e.extract() for e in element.css('p') if (len(e.css('a')) and get_text_from_div(e.extract().strip()) == get_text_from_div(e.css('a')[0].extract())) == False])).strip()
                    case 'Selenium':
                        return get_text_from_div('\n'.join([e.get_attribute('innerHTML') for e in element.find_elements(By.CSS_SELECTOR,'p') if len(e.find_elements(By.CSS_SELECTOR,'a')) == 0 or get_text_from_div(e.find_element(By.CSS_SELECTOR,'a').get_attribute('innerHTML')).strip() != get_text_from_div(e.get_attribute('innerHTML')).strip()]))
            match mode:
                case 'Scrapy':
                    return get_text_from_div(element.extract())
                case 'Selenium':
                    return get_text_from_div(element.get_attribute('innerHTML'))
        case 'views'|'comments'|'likes'|'dislikes':
            match mode:
                case 'Scrapy':
                    count = get_text_from_div(element.extract().lower().replace(',','').replace('like','').replace('comment','').replace('view','')).replace('s','').strip()
                case 'Selenium':
                    count = get_text_from_div(get_text_from_div(element.get_attribute('innerHTML')).lower().replace(',','').replace('like','').replace('comment','').replace('view','')).replace('s','').strip()
            if 'k' in count:
                multiply = 1000
                count = count.replace('k','').strip()
            elif 'm' in count:
                multiply = 1000000
                count = count.replace('m','').strip()
            else:
                multiply = 1
            count = float(count)*multiply
            return int(count)
        case 'tags':
            match mode:
                case 'Scrapy':
                    return [get_text_from_div(e.extract()).strip().lower() for e in element.css('a')]
                case 'Selenium':
                    return [get_text_from_div(e.get_attribute('innerHTML')).strip() for e in element.find_elements(By.CSS_SELECTOR,'a')]
def extract_image_and_caption(mode,image):
    ''' This function extracts image and caption

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    image: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        scrapy element

    Returns
    --------
    dict:
        Image url and caption dict
    '''
    image_dict = {}
    match mode:
        case 'Scrapy':
            image_dict['url'] = image.css('img')[0].xpath('@src')[0].extract()
            try:
                try:
                    caption = image.css('img')[0].xpath('@alt')[0].extract()
                except IndexError:
                    caption = image.css('img')[0].xpath('@title')[0].extract()
            except IndexError:
                caption = ''
        case 'Selenium':
            image_dict['url'] = image.find_element(By.CSS_SELECTOR,'img').get_attribute('src')
            try:
                try:
                    caption = image.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
                except IndexError:
                    caption = image.find_element(By.CSS_SELECTOR,'img').get_attribute('title')
            except IndexError:
                # Keep caption defined even when no attribute could be read.
                caption = ''
    if caption.strip() != '':
        image_dict['caption'] = get_text_from_div(caption)
    return image_dict
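As a quick illustration of the two standalone helpers above (the URL and date string are made-up inputs, not output captured from a real run):

from src.news_functions import get_website, to_datetime

print(get_website('https://www.aajtak.in/india/story/example-article'))  # -> 'aajtak.in'
print(to_datetime('2024-03-15T08:30:00+05:30'))                          # -> datetime.datetime(2024, 3, 15, 0, 0)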
src/pipelines.py
1"""2Scrapy item pipelines module3
4This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components5that handle the scraped items, typically used for cleaning, validating, and persisting data.6
7For detailed information on creating and utilizing item pipelines, refer to the official documentation:8http://doc.scrapy.org/en/latest/topics/item-pipeline.html9"""10
11from scrapy import Spider12
13from .items import TitleItem14
15
16class TitleItemPipeline:17 """18 This item pipeline defines processing steps for TitleItem objects scraped by spiders.19 """20
21 def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:22 # Do something with the item here, such as cleaning it or persisting it to a database23 return item
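The pipeline above is the unmodified template stub; items pass through untouched and the Apify integration pushes them to the dataset. If you wanted to clean records before they are stored, a minimal sketch could look like this (illustrative only; the class name is hypothetical and it would also need to be registered in ITEM_PIPELINES in src/settings.py):

class CleanFieldsPipeline:
    '''Hypothetical pipeline that trims whitespace from every string field of a scraped record.'''

    def process_item(self, item, spider):
        for key, value in item.items():
            if isinstance(value, str):
                item[key] = value.strip()
        return item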
src/settings.py
1"""2Scrapy settings module3
4This module contains Scrapy settings for the project, defining various configurations and options.5
6For more comprehensive details on Scrapy settings, refer to the official documentation:7http://doc.scrapy.org/en/latest/topics/settings.html8"""9
10# You can update these options and add new ones11BOT_NAME = 'titlebot'12DEPTH_LIMIT = 113LOG_LEVEL = 'INFO'14NEWSPIDER_MODULE = 'src.spiders'15REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'16ROBOTSTXT_OBEY = True17SPIDER_MODULES = ['src.spiders']18ITEM_PIPELINES = {19 'src.pipelines.TitleItemPipeline': 123,20}21SPIDER_MIDDLEWARES = {22 'src.middlewares.TitleSpiderMiddleware': 543,23}24DOWNLOADER_MIDDLEWARES = {25 'src.middlewares.TitleDownloaderMiddleware': 543,26}
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.7.0
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.1
scrapy.cfg
[settings]
default = src.settings

[deploy]
project = src