news-scraper

Deprecated

This Actor is unavailable because the developer has decided to deprecate it.
shreemayp/news-scraper

Input a list of news URLs to fetch news details - News Headline, Subtitle, Date, Content, etc.
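
For a rough idea of the result, each successfully scraped URL yields one record shaped like the dict built in src/spiders/title.py. The example below is hypothetical; all values are placeholders, and uploaddate is stored as a datetime.

# Hypothetical example of one output record; field names follow src/spiders/title.py.
import datetime

example_record = {
    "url": "https://www.aajtak.in/india/story/example-article",  # placeholder URL
    "title": "Example headline",
    "subtitle": "Example subtitle",
    "author": "Example reporter",
    "uploaddate": datetime.datetime(2024, 3, 12),
    "content": "Full article text joined from the page's paragraphs...",
}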

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
    && python --version \
    && echo "Pip version:" \
    && pip --version \
    && echo "Installing dependencies:" \
    && pip install -r requirements.txt \
    && echo "All installed Python packages:" \
    && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "my-actor",
    "title": "Getting started with Python and Scrapy",
    "description": "Scrapes titles of websites using Scrapy.",
    "version": "0.0",
    "meta": {
        "templateId": "python-scrapy"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Python Scrapy Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                { "url": "https://apify.com" }
            ],
            "editor": "requestListSources"
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and HTTP configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
            "editor": "proxy",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true }
        }
    },
    "required": ["startUrls"]
}
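
A minimal sketch of an Actor input that conforms to this schema (the article URL is a placeholder). For a local `apify run`, the input is typically read from the default key-value store, so the sketch writes it there.

import json

# A sketch of an input payload matching the schema above; the URL is a placeholder.
run_input = {
    "startUrls": [
        {"url": "https://www.aajtak.in/india/story/example-article"},
    ],
    "proxyConfiguration": {"useApifyProxy": True},
}

# Assumed local storage layout used by the Apify CLI.
with open("storage/key_value_stores/default/INPUT.json", "w", encoding="utf-8") as fp:
    json.dump(run_input, fp, indent=2)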

src/newspapers/aajtak.in.json

{
  "website": "aajtak.in",
  "channel": "Aaj Tak",
  "language": { "": "Hindi", "bangla": "Bangla" },
  "author": { "text": "//div[@class=\"brand-detial-main\"]/h4" },
  "content": {
    "text": "div[class=\"story-with-main-sec\"]",
    "p": true,
    "skip_p": [ "ये भी पढ़ें" ]
  },
  "image": { "text": "div[class=\"content-area\"]" },
  "subtitle": { "text": "h2[id=\"copy_true\"]" },
  "title": { "text": "h1" },
  "uploaddate": [
    {
      "text": "meta[property=\"article:published_time\"]",
      "to_datetime": ""
    },
    {
      "text": "div[class=\"brand-detial-main\"]",
      "re": "(d{1,2}) (month) (d{4})",
      "order": [ 2, 1, 0 ]
    }
  ]
}
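
The second uploaddate entry is a fallback used when the meta tag is missing: process_field in src/news_functions.py expands the "month" and "d{" placeholders into a real regex and then reorders the captured groups via "order" into (year, month, day). A minimal sketch of that expansion, using abbreviated copies of the lookup tables and a hypothetical date string:

import datetime
import re

# Abbreviated copies of the tables in src/news_functions.py, just enough for this sketch.
months_pattern = r"[\u0900-\u097F]+|[A-Za-z]+"
months = {"12": "12", "मार्च": "03", "march": "03"}

f = {"re": "(d{1,2}) (month) (d{4})", "order": [2, 1, 0]}

# process_field() expands the placeholders: "month" -> months_pattern, "d{" -> "\d{"
pattern = f["re"].replace("month", months_pattern).replace("d{", "\\d{")

text = "12 मार्च 2024"  # hypothetical text from div.brand-detial-main
groups = re.findall(pattern, text)[0]  # ('12', 'मार्च', '2024')

# Reorder per f["order"] (year, month, day) and resolve month names via the table.
date_list = [
    int(groups[i]) if len(groups[i]) == 4 and groups[i].isnumeric()
    else int(months[groups[i].lower()])
    for i in f["order"]
]
print(datetime.datetime(*date_list))  # 2024-03-12 00:00:00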

src/spiders/__init__.py

1"""
2Scrapy spiders package
3
4This package contains the spiders for your Scrapy project. Spiders are the classes that define how to scrape
5and process data from websites.
6
7For detailed information on creating and utilizing spiders, refer to the official documentation:
8https://docs.scrapy.org/en/latest/topics/spiders.html
9"""

src/spiders/title.py

from __future__ import annotations

import json
from typing import Generator

from scrapy import Request, Spider
from scrapy.exceptions import NotSupported
from scrapy.responsetypes import Response

from ..news_functions import get_field, get_website


class TitleSpider(Spider):
    """
    Scrapes news article pages and yields the extracted fields as dicts.
    """

    name = 'title_spider'

    # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
    # when the project is executed using Apify.
    start_urls = []

    def parse(self, response: Response) -> Generator[dict | Request, None, None]:
        """
        Parse the web page response.

        Args:
            response: The web page response.

        Yields:
            A dict with the extracted news fields (url, uploaddate, title, subtitle, author, content).
        """
        self.logger.info('TitleSpider is parsing %s...', response)

        # Load the per-website extraction config and extract each configured field.
        url = response.url
        website = get_website(url)
        with open('/usr/src/app/src/newspapers/' + website + '.json', 'r') as config_file:
            x = json.load(config_file)
        mongo_dict = {'url': response.url}
        for field in ['uploaddate', 'title', 'subtitle', 'author', 'content']:
            if not x or field not in x:
                continue
            try:
                self.logger.debug('Extracting field %s', field)
                get_field('Scrapy', response, x, mongo_dict, field)
                # Drop fields that came back empty.
                if field in mongo_dict and mongo_dict[field] in ['', None]:
                    del mongo_dict[field]
            except (IndexError, ValueError, NotSupported):
                pass
        yield mongo_dict
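
As a rough illustration of the config-lookup step in TitleSpider.parse (a standalone sketch, not part of the Actor): get_website reduces the response URL to a bare domain, which selects the per-site JSON config in src/newspapers/. The snippet assumes it is run from the project root and uses a relative path instead of the hardcoded /usr/src/app one; the article URL is a placeholder.

import json
import os

from src.news_functions import get_website

url = "https://www.aajtak.in/india/story/example-article"  # hypothetical URL
website = get_website(url)                                  # -> "aajtak.in"

config_path = os.path.join("src", "newspapers", f"{website}.json")
with open(config_path, "r", encoding="utf-8") as fp:
    config = json.load(fp)

print(config["channel"])  # "Aaj Tak"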

src/__main__.py

1"""
2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
7`scrapy crawl title_spider`.
8
9We recommend you do not modify this file unless you really know what you are doing.
10"""
11
12# We need to configure the logging first before we import anything else, so that nothing else imports
13# `scrapy.utils.log` before we patch it.
14from __future__ import annotations
15from logging import StreamHandler, getLogger
16from typing import Any
17from scrapy.utils import log as scrapy_logging
18from scrapy.utils.project import get_project_settings
19from apify.log import ActorLogFormatter
20
21# Define names of the loggers.
22MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
23OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
24ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
28# a specific logger, do it in this file.
29settings = get_project_settings()
30LOGGING_LEVEL = settings['LOG_LEVEL']
31
32# Define a logging handler which will be used for the loggers.
33apify_handler = StreamHandler()
34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))
35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
38    """
39    Configure a logger with the specified settings.
40
41    Args:
42        logger_name: The name of the logger to be configured.
43        log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
44        handlers: Optional list of logging handlers.
45    """
46    logger = getLogger(logger_name)
47    logger.setLevel(log_level)
48    logger.handlers = []
49
50    for handler in handlers:
51        logger.addHandler(handler)
52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
55# the `main.py` and Scrapy components.
56for logger_name in MAIN_LOGGER_NAMES:
57    configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
63# otherwise we would lose some log messages.
64old_configure_logging = scrapy_logging.configure_logging
65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:
68    """
69    We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
70    logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
71    loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
72    these four loggers and the root logger.
73    """
74    old_configure_logging(*args, **kwargs)
75
76    # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
77    # property within spiders. See details in the Spider logger property:
78    # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
79    configure_logger(None, LOGGING_LEVEL, apify_handler)
80
81    # We modify other loggers only by setting up their log level. A custom log handler is added
82    # only to the root logger to avoid duplicate log messages.
83    for logger_name in ALL_LOGGER_NAMES:
84        configure_logger(logger_name, LOGGING_LEVEL)
85
86    # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
87    # messages, especially when running on the platform.
88    configure_logger('httpx', 'WARNING')
89
90
91scrapy_logging.configure_logging = new_configure_logging
92
93# Now we can do the rest of the setup
94import asyncio
95import os
96import nest_asyncio
97from scrapy.utils.reactor import install_reactor
98from .main import main
99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
102# The reactor installation must be done manually before calling `nest_asyncio.apply()`,
103# otherwise, it will not work correctly on Windows.
104install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
105nest_asyncio.apply()
106
107# Specify the path to the Scrapy project settings module
108os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
109
110# Run the Apify main coroutine
111asyncio.run(main())

src/items.py

1"""
2Scrapy item models module
3
4This module defines Scrapy item models for scraped data. Items represent structured data
5extracted by spiders.
6
7For detailed information on creating and utilizing items, refer to the official documentation:
8https://docs.scrapy.org/en/latest/topics/items.html
9"""
10
11from scrapy import Field, Item
12import scrapy
13
14
15class TitleItem(Item):
16    """
17    Represents a title item scraped from a web page.
18    """
19
20    url = scrapy.Field()
21    header_link = scrapy.Field()
22    website = scrapy.Field()
23    date_added = scrapy.Field()
24    last_updated = scrapy.Field()
25    channel = scrapy.Field()
26    title = scrapy.Field()
27    subtitle = scrapy.Field()
28    image = scrapy.Field()
29    author = scrapy.Field()
30    uploaddate = scrapy.Field()
31    content = scrapy.Field()
32    views = scrapy.Field()
33    comments = scrapy.Field()
34    tags = scrapy.Field()
35    likes = scrapy.Field()
36    dislikes = scrapy.Field()
37    news_country = scrapy.Field()
38    news_region = scrapy.Field()
39    update = scrapy.Field()

src/main.py

1"""
2This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
3processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by
4applying Apify-related settings. Which includes adding a custom scheduler, retry middleware, and an item pipeline
5for pushing data to the Apify dataset.
6
7Customization:
8--------------
9
10Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
11components like spiders and handling Actor input. However, make sure you have a clear understanding of your
12modifications. For instance, removing `apply_apify_settings` break the integration between Scrapy and Apify.
13
14Documentation:
15--------------
16
17For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
18other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
19"""
20
21from __future__ import annotations
22
23from scrapy.crawler import CrawlerProcess
24
25from apify import Actor
26from apify.scrapy.utils import apply_apify_settings
27
28# Import your Scrapy spider here
29from .spiders.title import TitleSpider as Spider
30
31# Default input values for local execution using `apify run`
32LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
33
34
35async def main() -> None:
36    """
37    Apify Actor main coroutine for executing the Scrapy spider.
38    """
39    async with Actor:
40        Actor.log.info('Actor is being executed...')
41
42        # Process Actor input
43        actor_input = await Actor.get_input() or {}
44        start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
45        proxy_config = actor_input.get('proxyConfiguration')
46
47        # Add start URLs to the request queue
48        rq = await Actor.open_request_queue()
49        for start_url in start_urls:
50            url = start_url.get('url')
51            await rq.add_request(request={'url': url, 'method': 'GET'})
52
53        # Apply Apify settings, it will override the Scrapy project settings
54        settings = apply_apify_settings(proxy_config=proxy_config)
55
56        # Execute the spider using Scrapy CrawlerProcess
57        process = CrawlerProcess(settings, install_root_handler=False)
58        process.crawl(Spider)
59        process.start()

src/middlewares.py

1"""
2Scrapy middlewares module
3
4This module defines Scrapy middlewares. Middlewares are processing components that handle requests and
5responses, typically used for adding custom headers, retrying requests, and handling exceptions.
6
7There are 2 types of middlewares: spider middlewares and downloader middlewares. For detailed information
8on creating and utilizing them, refer to the official documentation:
9https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11"""
12
13from __future__ import annotations
14from typing import Generator, Iterable
15
16from scrapy import Request, Spider, signals
17from scrapy.crawler import Crawler
18from scrapy.http import Response
19
20# useful for handling different item types with a single interface
21from itemadapter import is_item, ItemAdapter
22
23
24class TitleSpiderMiddleware:
25    # Not all methods need to be defined. If a method is not defined,
26    # scrapy acts as if the spider middleware does not modify the
27    # passed objects.
28
29    @classmethod
30    def from_crawler(cls, crawler: Crawler) -> TitleSpiderMiddleware:
31        # This method is used by Scrapy to create your spiders.
32        s = cls()
33        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
34        return s
35
36    def process_spider_input(self, response: Response, spider: Spider) -> None:
37        # Called for each response that goes through the spider
38        # middleware and into the spider.
39
40        # Should return None or raise an exception.
41        return None
42
43    def process_spider_output(
44        self,
45        response: Response,
46        result: Iterable,
47        spider: Spider,
48    ) -> Generator[Iterable[Request] | None, None, None]:
49        # Called with the results returned from the Spider, after
50        # it has processed the response.
51
52        # Must return an iterable of Request, or item objects.
53        for i in result:
54            yield i
55
56    def process_spider_exception(
57        self,
58        response: Response,
59        exception: BaseException,
60        spider: Spider,
61    ) -> Iterable[Request] | None:
62        # Called when a spider or process_spider_input() method
63        # (from other spider middleware) raises an exception.
64
65        # Should return either None or an iterable of Request or item objects.
66        pass
67
68    def process_start_requests(
69        self, start_requests: Iterable[Request], spider: Spider
70    ) -> Iterable[Request]:  # Called with the start requests of the spider, and works
71        # similarly to the process_spider_output() method, except
72        # that it doesn’t have a response associated.
73
74        # Must return only requests (not items).
75        for r in start_requests:
76            yield r
77
78    def spider_opened(self, spider: Spider) -> None:
79        pass
80
81
82class TitleDownloaderMiddleware:
83    # Not all methods need to be defined. If a method is not defined,
84    # scrapy acts as if the downloader middleware does not modify the
85    # passed objects.
86
87    @classmethod
88    def from_crawler(cls, crawler: Crawler) -> TitleDownloaderMiddleware:
89        # This method is used by Scrapy to create your spiders.
90        s = cls()
91        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
92        return s
93
94    def process_request(self, request: Request, spider: Spider) -> Request | Response | None:
95        # Called for each request that goes through the downloader
96        # middleware.
97
98        # Must either:
99        # - return None: continue processing this request
100        # - or return a Response object
101        # - or return a Request object
102        # - or raise IgnoreRequest: process_exception() methods of
103        #   installed downloader middleware will be called
104        return None
105
106    def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
107        # Called with the response returned from the downloader.
108
109        # Must either;
110        # - return a Response object
111        # - return a Request object
112        # - or raise IgnoreRequest
113        return response
114
115    def process_exception(self, request: Request, exception: BaseException, spider: Spider) -> Response | None:
116        # Called when a download handler or a process_request()
117        # (from other downloader middleware) raises an exception.
118
119        # Must either:
120        # - return None: continue processing this exception
121        # - return a Response object: stops process_exception() chain
122        # - return a Request object: stops process_exception() chain
123        pass
124
125    def spider_opened(self, spider: Spider) -> None:
126        pass

src/news_functions.py

"""Helper functions for extracting news fields (dates, text, images, counts) from web pages."""

import datetime
import re

try:
    # `By` is only needed for the Selenium code paths below; Selenium is not a dependency
    # of this Actor, so the import is optional.
    from selenium.webdriver.common.by import By
except ImportError:
    By = None

months = {
            '01':'01','1':'1',
            '02':'02','2':'2',
            '03':'03','3':'3',
            '04':'04','4':'4',
            '05':'05','5':'5',
            '06':'06','6':'6',
            '07':'07','7':'7',
            '08':'08','8':'8',
            '09':'09','9':'9',
            '10':'10',
            '11':'11',
            '12':'12',
            '13':'13',
            '14':'14',
            '15':'15',
            '16':'16',
            '17':'17',
            '18':'18',
            '19':'19',
            '20':'20',
            '21':'21',
            '22':'22',
            '23':'23',
            '24':'24',
            '25':'25',
            '26':'26',
            '27':'27',
            '28':'28',
            '29':'29',
            '30':'30',
            '31':'31',
            'jan':'01','januari':'01','january':'01','janvier':'01','જાન્યુઆરી':'01','जनवरी':'01','जानेवारी':'01','jany':'01', 'জানুয়ারি':'01',
            'feb':'02','februari':'02','february':'02','février':'02','ફેબ્રુઆરી':'02','फरवरी':'02','फेब्रुवारी':'02','ഫെബ്രുവരി':'02', 'ফেব্রুয়ারি':'02',
            'mar':'03','march':'03','mars':'03','માર્ચ':'03','मार्च':'03','maart':'03','মার্চ':'03',
            'apr':'04','april':'04','avril':'04','એપ્રિલ':'04','अप्रैल':'04','एप्रिल':'04','এপ্রিল':'04',
            'may':'05','mei':'05','mai':'05','મે':'05','मई':'05','मे':'05','মে':'05','മെയ്':'05',
            'jun':'06','june':'06','juin':'06','juni':'06','જૂન':'06','जून':'06','জুন':'06',
            'jul':'07','july':'07','juillet':'07','જુલાઈ':'07','जुलाई':'07','जुलै':'07','জুলাই':'07','juli':'07','jyuly':'07','ജൂലൈ':'07',
            'aug':'08','august':'08','août':'08','ઓગસ્ટ':'08','अगस्त':'08','ऑगस्ट':'08','augustus':'08','আগস্ট':'08',
            'sep':'09','sept':'09','september':'09','septembre':'09','સપ્ટેમ્બર':'09','सितम्बर':'09','सप्टेंबर':'09','সেপ্টেম্বর':'09',
            'oct':'10','oktober':'10','october':'10','octobre':'10','ઓક્ટોબર':'10','अक्टूबर':'10','ऑक्टोबर':'10','অক্টোবর':'10',
            'nov':'11','november':'11','novembre':'11','નવેમ્બર':'11','नवम्बर':'11','नोव्हेंबर':'11','नवंबर':'11','নভেম্বর':'11',
            'dec':'12','december':'12','décembre':'12','ડિસેમ્બર':'12','दिसंबर':'12','डिसेंबर':'12','ডিসেম্বর':'12'
            }
months_pattern = r'[\u0900-\u097F]+|[\u0A80-\u0AFF]+|[\u0C00-\u0C7F]+|[\u0D00-\u0D7F]+|[\u0B80-\u0BFF]+|[\u0A80-\u0AFF]+|[\u0A00-\u0A7F]+|[\u0B00-\u0B7F]+|[\u0980-\u09FF]+|[\u0600-\u06FF]+|[A-Za-zéû]+'
url_pattern = r'[\w\:\/\.\?\[\]\=\&\;\-\%]+'
this_month = datetime.datetime.now()
this_hour = datetime.datetime(this_month.year,this_month.month,this_month.day,this_month.hour)
this_day = datetime.datetime(this_month.year,this_month.month,this_month.day)
# this_month ends up as the first day of the previous month.
this_month = datetime.datetime(this_month.year,this_month.month,1)
this_month = this_month - datetime.timedelta(days=1)
this_month = datetime.datetime(this_month.year,this_month.month,1)

def convert_css_selector(text):
    ''' This function converts Selenium selectors for Scrapy / bs4

    Parameters
    --------
    text: str
        String Selenium selector

    Returns
    --------
    list
        Selector tag with class/id and its value, e.g. ['div', 'class', 'entry-content']
    '''
    if '//' in text:
        if '/' in text.replace('//',''):
            return text
        return list(re.findall(r'\/\/(\w+)\[\@(\w+)\*?\=["\']([a-zA-Z0-9\:\-\_]+)["\']\]',text)[0])
    if '[' in text:
        return list(re.findall(r'(\w+)\[(\w+)\*?\=["\']([a-zA-Z0-9\:\-\_]+)["\']\]',text)[0])
    return [text,'','']

def get_text_from_div(content):
    ''' Extracts plain text from html tags

    Parameters
    --------
    content: str
        Input string

    Returns
    --------
    str
        Cleaned string with plain text
    '''
    content = content.replace('<br>','\n')
    temp_content = ''
    flag = 1
    for i in range(len(content)):
        if content[i] == '<':
            if len(temp_content) and temp_content[-1] == '.':
                temp_content = temp_content+' '
            flag = 0
            temp_content = temp_content+' '
            continue
        if content[i] == '>' and content[:i].split('<')[-1].startswith('script') == False and content[:i].split('<')[-1].startswith('style') == False:
            flag = 1
            continue
        if flag:
            temp_content = temp_content+content[i]
    while '  ' in temp_content:
        temp_content = temp_content.replace('  ',' ')
    return temp_content

def get_website(link):
    ''' Get website name from link
        For e.g. https://www.theaustralian.com.au/news/ is changed to theaustralian.com.au
                https://www.youtube.com/watch?v=..... is changed to youtube.com

    Parameters
    --------
    link: str
        URL text

    Returns
    --------
    str
        website name
    '''
    pattern = r'[a-z0-9\-]*\.*[a-z0-9\-]*\.*[a-z0-9\-]*\.*[a-z0-9\-]+\.[a-z]+'
    website = re.findall(pattern,link)[0].replace('www.','').replace('www1.','').split('.')
    if website[-2] in ['ac','blogspot','co','com','go','indiatimes','nic','net','org','gov']:
        website = '.'.join(website[-3:])
    else:
        website = '.'.join(website[-2:])
    return website

def to_datetime(uploaddate):
    ''' Date string is converted to datetime.datetime

    Parameters
    --------
    uploaddate: str
        Date string

    Returns
    --------
    datetime.datetime
    '''
    pattern = r'(\d{4})-(\d{2})-(\d{2})'
    uploaddate2 = re.findall(pattern,uploaddate)
    if len(uploaddate2):
        uploaddate2 = uploaddate2[0]
        return datetime.datetime(int(uploaddate2[0]),int(uploaddate2[1]),int(uploaddate2[2]))
    pattern = r'(\d{2})-(\d{2})-(\d{2})'
    try:
        uploaddate2 = re.findall(pattern,uploaddate)[0]
        return datetime.datetime(int('20'+uploaddate2[2]),int(uploaddate2[1]),int(uploaddate2[0]))
    except IndexError:
        pass
    pattern = r'(\d{4})\/(\d{1,2})\/(\d{1,2})'
    try:
        uploaddate2 = re.findall(pattern,uploaddate)[0]
        return datetime.datetime(int(uploaddate2[0]),int(uploaddate2[1]),int(uploaddate2[2]))
    except IndexError:
        pass
    pattern = r'(\d{1,2})\/(\d{1,2})\/(\d{4})'
    try:
        uploaddate2 = re.findall(pattern,uploaddate)[0]
        return datetime.datetime(int(uploaddate2[2]),int(uploaddate2[1]),int(uploaddate2[0]))
    except IndexError:
        pass
    return None

def get_single_field(mode,obj,f,mongo_dict,field):
    ''' This function gets a field using the field element information

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    obj: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        response of webpage
    f: dict
        website attributes element
    mongo_dict: dict
        news record
    field: str
        field to be extracted
    '''
    if 'index' in f:
        index = f['index']
    else:
        index = 0
    if mode == 'Scrapy':
        f['text'] = convert_selenium_to_scrapy(f['text'])
    if f['text'] == 'url':
        match mode:
            case 'Scrapy':
                mongo_dict[field] = process_field(mode,obj.url,field,f)
            case 'Selenium':
                mongo_dict[field] = process_field(mode,obj.current_url,field,f)
    elif '//' in f['text'][:2]:
        if 'next' in f:
            if mode == 'Scrapy':
                f['next']['text'] = convert_selenium_to_scrapy(f['next']['text'])
            if '//' in f['next']['text'][:2]:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.xpath(f['text'])[index].xpath(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.XPATH,f['text'])[index].find_element(By.XPATH,f['next']['text']),field,f)
            else:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.xpath(f['text'])[index].css(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.XPATH,f['text'])[index].find_element(By.CSS_SELECTOR,f['next']['text']),field,f)
        else:
            match mode:
                case 'Scrapy':
                    mongo_dict[field] = process_field(mode,obj.xpath(f['text'])[index],field,f)
                case 'Selenium':
                    mongo_dict[field] = process_field(mode,obj.find_elements(By.XPATH,f['text'])[index],field,f)
    else:
        if 'next' in f:
            if mode == 'Scrapy':
                f['next']['text'] = convert_selenium_to_scrapy(f['next']['text'])
            if '//' in f['next']['text'][:2]:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.css(f['text'])[index].xpath(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.CSS_SELECTOR,f['text'])[index].find_element(By.XPATH,f['next']['text']),field,f)
            else:
                match mode:
                    case 'Scrapy':
                        mongo_dict[field] = process_field(mode,obj.css(f['text'])[index].css(f['next']['text'])[0],field,f)
                    case 'Selenium':
                        mongo_dict[field] = process_field(mode,obj.find_elements(By.CSS_SELECTOR,f['text'])[index].find_element(By.CSS_SELECTOR,f['next']['text']),field,f)
        else:
            match mode:
                case 'Scrapy':
                    mongo_dict[field] = process_field(mode,obj.css(f['text'])[index],field,f)
                case 'Selenium':
                    mongo_dict[field] = process_field(mode,obj.find_elements(By.CSS_SELECTOR,f['text'])[index],field,f)
    if 'split' in f:
        mongo_dict[field] = mongo_dict[field].split(f['split'])[f['split_index']]

def get_field(mode,obj,x,mongo_dict,field):
    ''' This function gets a field using the field element information

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    obj: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        response of webpage
    x: dict
        website attributes element
    mongo_dict: dict
        news record
    field: str
        field to be extracted
    '''
    if type(x[field]) == list:
        for f in x[field]:
            try:
                get_single_field(mode,obj,f,mongo_dict,field)
                break
            except IndexError:
                pass
    else:
        get_single_field(mode,obj,x[field],mongo_dict,field)
    if field in mongo_dict and type(mongo_dict[field]) == str:
        mongo_dict[field] = mongo_dict[field].replace('\n',' ').strip()

def process_field(mode,element,field,f):
    ''' This function processes the text from an extracted element based on what field it is
        For e.g. for uploaddate, text will be converted to datetime.datetime,
                for content, all paragraphs will be extracted and joined, and so on

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    element: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        Scrapy or Selenium element
    field: str
        field to be processed (title, subtitle, author, etc.)
    f: dict
        website attributes element

    Returns
    --------
    str/int
        Processed field (title/subtitle/author, etc.)
    '''
    print('reached',field)
    match field:
        case 'subtitle'|'author'|'title':
            if field == 'author' and type(element) == list:
                match mode:
                    case 'Scrapy':
                        return [get_text_from_div(e.extract()).strip() for e in element]
                    case 'Selenium':
                        return [get_text_from_div(e.get_attribute('innerHTML')).strip() for e in element]
            match mode:
                case 'Scrapy':
                    return get_text_from_div(element.extract())
                case 'Selenium':
                    return get_text_from_div(element.get_attribute('innerHTML'))
        case 'uploaddate':
            match f['text']:
                case 'html':
                    if 'today' in f:
                        today = datetime.datetime.now()
                        return datetime.datetime(today.year,today.month,today.day)
                    match mode:
                        case 'Scrapy':
                            uploaddate = element.extract().lower()
                        case 'Selenium':
                            uploaddate = element.get_attribute('innerHTML').lower()
                    if 'to_datetime' in f:
                        return to_datetime(uploaddate.split(f['to_datetime'])[1].split('"')[2][:10])
                    pattern = f['re'].replace('month',months_pattern).replace('d{','\\d{')
                    uploaddate = re.findall(pattern, uploaddate)[0]
                case 'url':
                    if 're' in f:
                        if 'order' not in f and 'to_datetime' in f:
                            f['order'] = [0,1,2]
                        pattern = f['re'].replace('month',months_pattern).replace('d{','\\d{')
                        uploaddate = re.findall(pattern, element)[0]
                    else:
                        return to_datetime(element)
                case _:
                    if 'to_datetime' in f and f['to_datetime'] == '':
                        match mode:
                            case 'Scrapy':
                                return to_datetime(element.extract())
                            case 'Selenium':
                                if 'meta[' in f['text']:
                                    return to_datetime(element.get_attribute('content'))
                                if f['text'] == 'time':
                                    return to_datetime(element.get_attribute('datetime'))
                                return to_datetime(element.get_attribute('innerHTML'))
                    elif f['re'] == 'ago':
                        match mode:
                            case 'Scrapy':
                                return process_uploaddate(element.extract())
                            case 'Selenium':
                                return process_uploaddate(element.get_attribute('innerHTML'))
                    else:
                        pattern = f['re'].replace('month',months_pattern).replace('d{','\\d{')
                        match mode:
                            case 'Scrapy':
                                text = convert_numbers(get_text_from_div(element.extract()).lower())
                            case 'Selenium':
                                if 'meta[' in f['text']:
                                    text = convert_numbers(get_text_from_div(element.get_attribute('content')).lower())
                                else:
                                    text = convert_numbers(get_text_from_div(element.get_attribute('innerHTML')).lower())
                        uploaddate = re.findall(pattern,text)[0]
            date_list = [int(uploaddate[f['order'][i]]) if len(uploaddate[f['order'][i]]) == 4 and uploaddate[f['order'][i]].isnumeric() else int(months[uploaddate[f['order'][i]].lower()]) for i in range(len(f['order']))]
            if len(date_list) == 2:
                uploaddate = datetime.datetime(datetime.datetime.now().year,date_list[0],date_list[1])
                if uploaddate < datetime.datetime.now():
                    return uploaddate
                return datetime.datetime(datetime.datetime.now().year-1,date_list[0],date_list[1])
            if 'add_2000' in f:
                date_list[0] = date_list[0]+2000
            return datetime.datetime(date_list[0],date_list[1],date_list[2])
        case 'image':
            return extract_image_and_caption(mode,element)
        case 'content':
            if 'p' not in f:
                f['p'] = True
            if f['p'] == True:
                if 'skip_p' in f:
                    match mode:
                        case 'Scrapy':
                            return get_text_from_div('\n'.join([e.extract() for e in element.css('p') if len(re.findall('|'.join(f['skip_p']),get_text_from_div(e.extract()).strip())) == 0 and (len(e.css('a')) and get_text_from_div(e.extract().strip()) == get_text_from_div(e.css('a')[0].extract())) == False]))
                        case 'Selenium':
                            return get_text_from_div('\n'.join([e for e in element.find_element(By.CSS_SELECTOR,'p').get_attribute('innerHTML') if len(re.findall('|'.join(f['skip_p']),e)) == 0]))
                match mode:
                    case 'Scrapy':
                        return get_text_from_div('\n'.join([e.extract() for e in element.css('p') if (len(e.css('a')) and get_text_from_div(e.extract().strip()) == get_text_from_div(e.css('a')[0].extract())) == False])).strip()
                    case 'Selenium':
                        return get_text_from_div('\n'.join([e.get_attribute('innerHTML') for e in element.find_elements(By.CSS_SELECTOR,'p') if len(e.find_elements(By.CSS_SELECTOR,'a')) == 0 or get_text_from_div(e.find_element(By.CSS_SELECTOR,'a').get_attribute('innerHTML')).strip() != get_text_from_div(e.get_attribute('innerHTML')).strip()]))
            match mode:
                case 'Scrapy':
                    return get_text_from_div(element.extract())
                case 'Selenium':
                    return get_text_from_div(element.get_attribute('innerHTML'))
        case 'views'|'comments'|'likes'|'dislikes':
            match mode:
                case 'Scrapy':
                    count = get_text_from_div(element.extract().lower().replace(',','').replace('like','').replace('comment','').replace('view','')).replace('s','').strip()
                case 'Selenium':
                    count = get_text_from_div(get_text_from_div(element.get_attribute('innerHTML')).lower().replace(',','').replace('like','').replace('comment','').replace('view','')).replace('s','').strip()
            if 'k' in count:
                multiply = 1000
                count = count.replace('k','').strip()
            elif 'm' in count:
                multiply = 1000000
                count = count.replace('m','').strip()
            else:
                multiply = 1
            count = float(count)*multiply
            return int(count)
        case 'tags':
            match mode:
                case 'Scrapy':
                    return [get_text_from_div(e.extract()).strip().lower() for e in element.css('a')]
                case 'Selenium':
                    return [get_text_from_div(e.get_attribute('innerHTML')).strip() for e in element.find_elements(By.CSS_SELECTOR,'a')]

def extract_image_and_caption(mode,image):
    ''' This function extracts the image URL and its caption

    Parameters
    --------
    mode: str
        Mode - Scrapy or Selenium
    image: scrapy.http.response.html.HtmlResponse / selenium_driver.SeleniumDriver
        Scrapy or Selenium element

    Returns
    --------
    dict
        Image url and caption dict
    '''
    image_dict = {}
    match mode:
        case 'Scrapy':
            image_dict['url'] = image.css('img')[0].xpath('@src')[0].extract()
            try:
                try:
                    caption = image.css('img')[0].xpath('@alt')[0].extract()
                except IndexError:
                    caption = image.css('img')[0].xpath('@title')[0].extract()
            except IndexError:
                caption = ''
        case 'Selenium':
            image_dict['url'] = image.find_element(By.CSS_SELECTOR,'img').get_attribute('src')
            try:
                try:
                    caption = image.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
                except IndexError:
                    caption = image.find_element(By.CSS_SELECTOR,'img').get_attribute('title')
            except IndexError:
                caption = ''
    if caption and caption.strip() != '':
        image_dict['caption'] = get_text_from_div(caption)
    return image_dict
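
For orientation, a few of the helpers above can be exercised standalone. A minimal sketch, assuming the project root is on the Python path; expected outputs are shown as comments.

from src.news_functions import convert_css_selector, get_text_from_div, get_website, to_datetime

# Selenium-style selectors are reduced to [tag, attribute, value] triples.
print(convert_css_selector('div[class="story-with-main-sec"]'))
# ['div', 'class', 'story-with-main-sec']

# HTML tags are stripped, leaving plain text (runs of spaces are collapsed).
print(get_text_from_div('<p>Hello <b>world</b>.</p>'))
# ' Hello world . '

# URLs are reduced to a bare website name.
print(get_website('https://www.youtube.com/watch?v=abc'))
# 'youtube.com'

# ISO-like date strings become datetime objects (time of day is discarded).
print(to_datetime('2024-03-12T10:15:00+05:30'))
# datetime.datetime(2024, 3, 12, 0, 0)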

src/pipelines.py

1"""
2Scrapy item pipelines module
3
4This module defines Scrapy item pipelines for scraped data. Item pipelines are processing components
5that handle the scraped items, typically used for cleaning, validating, and persisting data.
6
7For detailed information on creating and utilizing item pipelines, refer to the official documentation:
8http://doc.scrapy.org/en/latest/topics/item-pipeline.html
9"""
10
11from scrapy import Spider
12
13from .items import TitleItem
14
15
16class TitleItemPipeline:
17    """
18    This item pipeline defines processing steps for TitleItem objects scraped by spiders.
19    """
20
21    def process_item(self, item: TitleItem, spider: Spider) -> TitleItem:
22        # Do something with the item here, such as cleaning it or persisting it to a database
23        return item

src/settings.py

1"""
2Scrapy settings module
3
4This module contains Scrapy settings for the project, defining various configurations and options.
5
6For more comprehensive details on Scrapy settings, refer to the official documentation:
7http://doc.scrapy.org/en/latest/topics/settings.html
8"""
9
10# You can update these options and add new ones
11BOT_NAME = 'titlebot'
12DEPTH_LIMIT = 1
13LOG_LEVEL = 'INFO'
14NEWSPIDER_MODULE = 'src.spiders'
15REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
16ROBOTSTXT_OBEY = True
17SPIDER_MODULES = ['src.spiders']
18ITEM_PIPELINES = {
19    'src.pipelines.TitleItemPipeline': 123,
20}
21SPIDER_MIDDLEWARES = {
22    'src.middlewares.TitleSpiderMiddleware': 543,
23}
24DOWNLOADER_MIDDLEWARES = {
25    'src.middlewares.TitleDownloaderMiddleware': 543,
26}

.dockerignore

# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.gitignore

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.7.0
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.1

scrapy.cfg

[settings]
default = src.settings

[deploy]
project = src