Investopedia Scraper
glitch_404/investopedia-scraper
Investopedia Scraper lets you get all the articles you want from Investopedia.com.
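The Actor's input (startUrls, dateRange, maxArticles, proxyConfiguration) is defined in .actor/input_schema.json below, and scraped articles are pushed to the run's default dataset. As a quick illustration, here is a minimal sketch of calling the Actor from Python with the official apify-client package; the token placeholder and the input values are examples only, mirroring the schema's prefilled defaults:

# Minimal sketch of running the Actor via the apify-client package.
# Assumes `pip install apify-client` and your own API token in place of <YOUR_API_TOKEN>.
from apify_client import ApifyClient

client = ApifyClient('<YOUR_API_TOKEN>')

run_input = {
    'startUrls': [{'url': 'https://www.investopedia.com/stocks-4427785'}],
    'dateRange': 'anytime',   # see the dateRange description in .actor/input_schema.json
    'maxArticles': 100,
    'proxyConfiguration': {'useApifyProxy': True},
}

# Start the Actor and wait for the run to finish.
run = client.actor('glitch_404/investopedia-scraper').call(run_input=run_input)

# Iterate over the scraped articles stored in the run's default dataset.
for item in client.dataset(run['defaultDatasetId']).iterate_items():
    print(item['title'], item['link'])

You can equally start runs from the Apify Console or the Apify CLI; the client call above is just the programmatic equivalent.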
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
package.json
{
    "name": "InvestopediaScraper",
    "version": "1.0.0",
    "description": "Investopedia Scraper will enable you to get all the Articles you want from Investopedia.com since they have no API.",
    "main": "src/main.py",
    "scripts": {
        "start": "python3 src/__main__.py",
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "keywords": ["investopedia.com", "investopedia", "news scraper", "scraper"],
    "author": "Glitch_404",
    "engines": {
        "node": ">=16.0.0",
        "npm": ">=8.0.0"
    },
    "license": "ISC",
    "dependencies": {
        "apify": "^0.19.1",
        "scrapy": "^2.11.0"
    }
}
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.5.3
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.0
itemadapter ~= 0.8.0
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = src.settings
shell = ipython

[deploy]
#url = http://localhost:6800/
project = src
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "Investopedia-Scraper",
    "title": "Investopedia-Scraper",
    "description": "Scrapes articles from investopedia.com using Scrapy",
    "buildTag": "latest",
    "version": "1.0",
    "readme": "./readme.md",
    "meta": {
        "templateId": "python-scrapy"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": "./dataset_schema.json"
    },
    "minMemoryMbytes": 1024,
    "maxMemoryMbytes": 4096
}
.actor/dataset_schema.json
{
    "actorSpecification": 1,
    "views": {
        "overview": {
            "title": "Overview",
            "transformation": {
                "fields": [
                    "image",
                    "title",
                    "category",
                    "author",
                    "reviewer",
                    "facts_checker",
                    "contributors",
                    "date",
                    "link",
                    "paragraph"
                ]
            },
            "display": {
                "component": "table",
                "properties": {
                    "image": {
                        "label": "Image",
                        "format": "image"
                    },
                    "title": {
                        "label": "Title",
                        "format": "text"
                    },
                    "category": {
                        "label": "Category",
                        "format": "text"
                    },
                    "author": {
                        "label": "Author",
                        "format": "text"
                    },
                    "reviewer": {
                        "label": "Reviewer",
                        "format": "text"
                    },
                    "facts_checker": {
                        "label": "Facts Checker",
                        "format": "text"
                    },
                    "contributors": {
                        "label": "Contributors",
                        "format": "array"
                    },
                    "date": {
                        "label": "Date",
                        "format": "date"
                    },
                    "link": {
                        "label": "Link",
                        "format": "link"
                    },
                    "paragraph": {
                        "label": "Paragraph",
                        "format": "text"
                    }
                }
            }
        }
    }
}
.actor/input_schema.json
{
    "title": "Python-Scrapy-InvestopediaScraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "URLs to start with",
            "prefill": [
                {"url": "https://www.investopedia.com/stocks-4427785"},
                {"url": "https://www.investopedia.com/savings-accounts-4689728"},
                {"url": "https://www.investopedia.com/personal-finance-4427760"},
                {"url": "https://www.investopedia.com/markets-news-4427704"},
                {"url": "https://www.investopedia.com/best-online-brokers-4587872"}
            ],
            "editor": "requestListSources"
        },
        "dateRange": {
            "title": "Date Range",
            "type": "string",
            "description": "Scrape only within a specific date range.\nThe date can only be in the following formats:\n\n* YYYY/M/D, e.g. 2023/5/23, to get all the articles published on that day\n\n* YYYY/M/D - YYYY/M/D, e.g. 2024/1/25 - 2024/1/30, to get a range of 5 days.\n Pay attention to the format (spaces and characters): YYYY/M/D - YYYY/M/D\n\n* 25 days, 5 weeks, 3 months or 2 years.\n Only the units day, week, month and year are supported.\n This option always scrapes from today's date back to the date specified,\n e.g. if today is 2024/1/30 and you enter 5 days, articles from 2024/1/25 to 2024/1/30 are scraped.\n\n* If you don't care about the date and just want roughly 100 articles, enter 'anytime' as the value.\n\n* Allowed formats: 2023/1/25 - 2023/1/30 or 2020/5/26 or 3 days or anytime\n\n* Leave blank to scrape only today's articles",
            "prefill": "anytime",
            "editor": "textfield"
        },
        "maxArticles": {
            "title": "Maximum Articles Amount",
            "type": "integer",
            "description": "Choose how many articles you want",
            "prefill": 100,
            "default": 100,
            "minimum": 1,
            "maximum": 1000000
        },
        "proxyConfiguration": {
            "sectionCaption": "Proxy and HTTP configuration",
            "title": "Proxy configuration",
            "type": "object",
            "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
            "editor": "proxy",
            "prefill": { "useApifyProxy": true },
            "default": { "useApifyProxy": true }
        }
    },
    "required": ["startUrls", "maxArticles"]
}
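The dateRange formats described in the schema above all resolve to a window of dates (or to no filtering at all for 'anytime'). As a rough illustration only, the sketch below shows how those inputs map to a (start, end) window; the parse_date_range helper is hypothetical and is not part of the Actor's code, which applies the filtering in src/pipelines.py instead.

# Hypothetical helper illustrating how the documented dateRange values resolve to date windows.
import datetime

def parse_date_range(value: str, today: datetime.date | None = None):
    today = today or datetime.date.today()
    value = value.lower().replace(' ', '').rstrip('s') if value else ''

    if not value:                       # blank -> only today's articles
        return today, today
    if value == 'anytime':              # no date filtering at all
        return None
    if '-' in value:                    # "2023/1/25 - 2023/1/30"
        start, end = (datetime.date(*map(int, part.split('/'))) for part in value.split('-'))
        return start, end
    if '/' in value:                    # "2020/5/26" -> a single day
        day = datetime.date(*map(int, value.split('/')))
        return day, day
    # "3 days", "5 weeks", "3 months", "2 years" -> a window ending today
    units = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    for unit, days in units.items():
        if value.endswith(unit):
            amount = float(value[:-len(unit)])
            return today - datetime.timedelta(days=int(amount * days)), today
    raise ValueError(f'malformed date range: {value!r}')

# e.g. parse_date_range('5 days', today=datetime.date(2024, 1, 30)) -> (2024-01-25, 2024-01-30)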
src/__init__.py
src/__main__.py
1"""
2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
7`scrapy crawl title_spider`.
8
9We recommend you do not modify this file unless you really know what you are doing.
10"""
11
12# We need to configure the logging first before we import anything else, so that nothing else imports
13# `scrapy.utils.log` before we patch it.
14from __future__ import annotations
15from logging import StreamHandler, getLogger
16from typing import Any
17from scrapy.utils import log as scrapy_logging
18from scrapy.utils.project import get_project_settings
19from apify.log import ActorLogFormatter
20
21# Define names of the loggers.
22APIFY_LOGGER_NAMES = ['apify', 'apify_client']
23SCRAPY_LOGGER_NAMES = ['filelock', 'hpack', 'httpx', 'scrapy', 'twisted']
24ALL_LOGGER_NAMES = APIFY_LOGGER_NAMES + SCRAPY_LOGGER_NAMES
25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
28# a specific logger, do it in this file.
29settings = get_project_settings()
30LOGGING_LEVEL = settings['LOG_LEVEL']
31
32# Define a logging handler which will be used for the loggers.
33apify_handler = StreamHandler()
34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))
35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
38 """
39 Configure a logger with the specified settings.
40
41 Args:
42 logger_name: The name of the logger to be configured.
43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).
44 handlers: Optional list of logging handlers.
45 """
46 logger = getLogger(logger_name)
47 logger.setLevel(log_level)
48 logger.handlers = []
49
50 for handler in handlers:
51 logger.addHandler(handler)
52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
55# the `main.py` and Scrapy components.
56for logger_name in APIFY_LOGGER_NAMES:
57 configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
63# otherwise we would lose some log messages.
64old_configure_logging = scrapy_logging.configure_logging
65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:
68 """
69 We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
70 logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
71 loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
72 these four loggers and the root logger.
73 """
74 old_configure_logging(*args, **kwargs)
75
76 # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
77 # property within spiders. See details in the Spider logger property:
78 # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
79 configure_logger(None, LOGGING_LEVEL, apify_handler)
80
81 # We modify other loggers only by setting up their log level. A custom log handler is added
82 # only to the root logger to avoid duplicate log messages.
83 for logger_name in ALL_LOGGER_NAMES:
84 configure_logger(logger_name, LOGGING_LEVEL)
85
86 # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
87 # messages, especially when running on the platform.
88 configure_logger('httpx', 'WARNING')
89
90
91scrapy_logging.configure_logging = new_configure_logging
92
93# Now we can do the rest of the setup
94import asyncio
95import os
96import nest_asyncio
97from scrapy.utils.reactor import install_reactor
98from src.main import main
99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
102install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
103nest_asyncio.apply()
104
105# Specify the path to the Scrapy project settings module
106os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
107
108# Run the Apify main coroutine
109asyncio.run(main())
src/items.py
import scrapy


class InvestopediaScraperItem(scrapy.Item):
    """Item holding the fields scraped for a single Investopedia article."""

    image = scrapy.Field(default=None, serializer=str)
    title = scrapy.Field(default=False, required=True, serializer=str)
    category = scrapy.Field(default=None, serializer=str)
    author = scrapy.Field(default=None, serializer=str)
    reviewer = scrapy.Field(default=None, serializer=str)
    facts_checker = scrapy.Field(default=None, serializer=str)
    contributors = scrapy.Field(default=None, serializer=str)
    date = scrapy.Field(default=None, serializer=str)
    paragraph = scrapy.Field(required=True, serializer=str)
    link = scrapy.Field(required=True, serializer=str)
src/main.py
from __future__ import annotations

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
from apify import Actor

# Import your Scrapy spider here
from src.spiders.Investopedia import InvestopediaSpider as Spider

# Default input values for local execution using `apify run`
LOCAL_DEFAULT_START_URLS = [
    {'url': "https://www.investopedia.com/stocks-4427785"},
    {'url': "https://www.investopedia.com/savings-accounts-4689728"},
    {'url': "https://www.investopedia.com/personal-finance-4427760"},
    {'url': "https://www.investopedia.com/markets-news-4427704"},
    {'url': "https://www.investopedia.com/best-online-brokers-4587872"},
]


def __custom_settings(start_urls, max_articles, date_range, proxy_config) -> Settings:
    settings = get_project_settings()

    # Use ApifyScheduler as the scheduler
    # settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'  # causes bugs

    # Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
    # ensuring it is executed as the final step in the pipeline sequence
    settings['ITEM_PIPELINES']['src.pipelines.ActorPushPipeline'] = 1000

    # Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)  # causes bugs
    # settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
    # settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

    # Store the proxy configuration
    settings['APIFY_PROXY_SETTINGS'] = proxy_config

    # Apply the custom Actor inputs to the settings
    settings['ALLOWED_DATE_RANGE'] = date_range
    settings['CLOSESPIDER_ITEMCOUNT'] = max_articles
    settings['START_URLS'] = start_urls

    return settings


async def main() -> None:
    """
    Apify Actor main coroutine for executing the Scrapy spider.
    """
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Process Actor input
        actor_input = await Actor.get_input() or {}

        start_urls = [url.get('url') for url in actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)]
        date_range = actor_input.get('dateRange', None)
        max_articles = actor_input.get('maxArticles', 100)
        proxy_config = actor_input.get('proxyConfiguration')

        Actor.log.info(f'\nstartUrls: {start_urls}\nmaxArticles: {max_articles}\ndateRange: {date_range}\n')

        # Apply the Apify settings; they override the Scrapy project settings
        settings = __custom_settings(start_urls, max_articles, date_range, proxy_config)

        # Execute the spider using Scrapy CrawlerProcess
        Actor.log.info('Crawling started')

        process = CrawlerProcess(settings)
        process.crawl(Spider)
        process.start()
src/middlewares.py
import random
from src.user_agent import user_agents


class InvestopediaFakeHeadersMiddleware:
    """Downloader middleware that attaches browser-like headers and a random User-Agent to every request."""

    def process_request(self, request, spider):
        request.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        request.headers['Accept-Encoding'] = 'gzip, deflate, br'
        request.headers['Sec-Ch-Ua-Platform'] = random.choice(['Windows', 'Linux', 'MacOS'])
        request.headers['User-Agent'] = random.choice(user_agents)
src/pipelines.py
import re
import math
import datetime
import apify.actor
import apify.scrapy.pipelines

from scrapy import Item, Spider
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem, CloseSpider


class ActorPushPipeline(apify.scrapy.pipelines.ActorDatasetPushPipeline):
    """
    A Scrapy pipeline for pushing items to an Actor's default dataset.
    This pipeline is designed to be enabled only when the Scrapy project is run as an Actor.
    """

    def __init__(self, item_count):
        self.item_count = item_count
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        item_count = crawler.settings.get('CLOSESPIDER_ITEMCOUNT')
        return cls(int(item_count))

    async def process_item(self, item: Item, spider: Spider) -> Item:
        """Pushes the provided Scrapy item to the Actor's default dataset."""
        self.counter += 1
        if self.counter <= self.item_count:
            item_dict = ItemAdapter(item).asdict()
            await apify.actor.Actor.push_data(item_dict)
            spider.logger.info(f'item #{self.counter} pushed to the dataset.')
        return item


class InvestopediaSpiderPipeline:
    def __init__(self, allowed_date_range):
        self.today = datetime.datetime.today().date()
        self.months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
        self.allowed_date_range = allowed_date_range
        self.drops_counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        allowed_date_range = crawler.settings.get('ALLOWED_DATE_RANGE', None)
        return cls(allowed_date_range)

    def __is_allowed_date(self, item, article_date, allowed_date_range):
        # Single date, e.g. "2023/5/23"
        if '-' not in allowed_date_range and '/' in allowed_date_range:
            if (date := datetime.date(*list(map(int, allowed_date_range.split('/'))))) != article_date:
                raise DropItem(f"dropped item #{item} date {article_date} not in requested date range {date}")

        # Explicit range, e.g. "2024/1/25 - 2024/1/30"
        elif '-' in allowed_date_range and '/' in allowed_date_range:
            date = allowed_date_range.split('-')
            start_date = datetime.date(*list(map(int, date[0].split('/'))))
            end_date = datetime.date(*list(map(int, date[1].split('/'))))

            if not (start_date <= article_date <= end_date):
                raise DropItem(f"dropped item #{item} article date {article_date} not in requested date range")

        # Relative range, e.g. "25 days", "5 weeks", "3 months", "2 years"
        elif any([word in allowed_date_range for word in ['day', 'week', 'year', 'month']]):
            if 'day' in allowed_date_range:
                days_amount = int(allowed_date_range.replace('day', ''))
            elif 'week' in allowed_date_range:
                days_amount = math.floor(float(allowed_date_range.replace('week', '')) * 7)
            elif 'month' in allowed_date_range:
                days_amount = math.floor(float(allowed_date_range.replace('month', '')) * 30)
            elif 'year' in allowed_date_range:
                days_amount = math.floor(float(allowed_date_range.replace('year', '')) * 365.25)
            else:
                raise CloseSpider(f"{allowed_date_range} date requested is malformed and can't be processed")

            if self.today - datetime.timedelta(days=days_amount) > article_date:
                raise DropItem(f"dropped item #{item} article date {article_date} not in requested date range")
        else:
            raise CloseSpider(f"{allowed_date_range} date requested is malformed and can't be processed")

        return True

    def process_item(self, item: Item, spider: Spider) -> Item:
        spider.logger.info(f'Processing item #{item}')

        if self.drops_counter >= 50:
            raise CloseSpider(f"[DropLimit] Drop limit reached: {self.drops_counter} drops")

        # Parse attribution dates of the form "<Updated|Published> <Month> <Day>, <Year>" into a datetime.date
        article_date = item['date'].replace(',', '').lower().split(' ')[1:]
        article_date = datetime.date(int(article_date[2]), self.months.index(article_date[0]) + 1, int(article_date[1]))
        try:
            if self.allowed_date_range:
                self.allowed_date_range = self.allowed_date_range.lower().replace(" ", "").replace('s', '')
                if 'anytime' not in self.allowed_date_range:
                    self.__is_allowed_date(item, article_date, self.allowed_date_range)
            else:
                if article_date != self.today:
                    raise DropItem(f"item {item} not from today's date")
        except DropItem as e:
            self.drops_counter += 1
            raise DropItem(f"item #{item} dropped due to ERROR: {e}")
        else:
            self.drops_counter = 0

        item['contributors'] = [f"{contributor}" for contributor in item['contributors'] if 'http' in contributor] or None
        item['date'] = article_date
        item['title'] = item['title'].strip()
        item['category'] = ','.join(item['category'])
        item['paragraph'] = re.sub(r"\n{2,}", "\n", ''.join(item['paragraph']).strip())

        return item
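For reference, process_item above expects the scraped item['date'] to be an Investopedia attribution string whose first word is a prefix such as "Updated" or "Published"; the sample string below is an assumed example, shown only to make the parsing step above easier to follow.

# Standalone sketch of the date parsing performed in InvestopediaSpiderPipeline.process_item.
# "Updated January 25, 2024" is an assumed example of what the spider scrapes into item['date'].
import datetime

months = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']

raw = 'Updated January 25, 2024'
parts = raw.replace(',', '').lower().split(' ')[1:]   # -> ['january', '25', '2024']
article_date = datetime.date(int(parts[2]), months.index(parts[0]) + 1, int(parts[1]))
print(article_date)                                   # -> 2024-01-25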
src/settings.py
BOT_NAME = "InvestopediaCrawler"
SPIDER_MODULES = ["src.spiders"]
NEWSPIDER_MODULE = "src.spiders"

CLOSESPIDER_ITEMCOUNT = 100
ALLOWED_DATE_RANGE = None
START_URLS = ["https://www.investopedia.com/investing-in-the-uk-7548232"]

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
ROBOTSTXT_OBEY = True
COOKIES_ENABLED = True

# LOG_FILE = 'scrapy.log'
# LOG_FILE_APPEND = False
LOG_LEVEL = 'INFO'

DOWNLOADER_MIDDLEWARES = {
    "src.middlewares.InvestopediaFakeHeadersMiddleware": 300,
}
ITEM_PIPELINES = {
    'src.pipelines.InvestopediaSpiderPipeline': 100
}

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
src/user_agent.py
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
    "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/7.1.8 Safari/537.85.17",
    "Mozilla/5.0 (iPad; CPU OS 8_4 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H143 Safari/600.1.4",
    "Mozilla/5.0 (iPad; CPU OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F69 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.6.3 (KHTML, like Gecko) Version/8.0.6 Safari/600.6.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (iPad; CPU OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS x86_64 7077.134.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.156 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/7.1.7 Safari/537.85.16",
    "Mozilla/5.0 (Windows NT 6.0; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (iPad; CPU OS 8_1_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B466 Safari/600.1.4",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4",
    "Mozilla/5.0 (Linux; U; Android 4.0.3; en-us; KFTT Build/IML74K) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Mozilla/5.0 (iPad; CPU OS 7_1_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D201 Safari/9537.53",
    "Mozilla/5.0 (Linux; U; Android 4.4.3; en-us; KFTHWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.6.3 (KHTML, like Gecko) Version/7.1.6 Safari/537.85.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.4.10 (KHTML, like Gecko) Version/8.0.4 Safari/600.4.10",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2",
    "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) CriOS/45.0.2454.68 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B410 Safari/600.1.4",
    "Mozilla/5.0 (iPad; CPU OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53",
    "Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; TNJB; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; ARM; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MDDCJS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H143 Safari/600.1.4",
    "Mozilla/5.0 (Linux; U; Android 4.4.3; en-us; KFASWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) GSA/7.0.55539 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MATBJS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-us; KFJWI Build/IMM76D) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 7_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D167 Safari/9537.53",
    "Mozilla/5.0 (X11; CrOS armv7l 7077.134.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.156 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0",
]
src/spiders/Investopedia.py
import scrapy
from src.items import InvestopediaScraperItem


class InvestopediaSpider(scrapy.Spider):
    # can scrape 952 articles in 73 seconds

    name = "InvestopediaSpider"
    allowed_domains = ["www.investopedia.com"]

    def __init__(self):
        super().__init__()
        self._id_counter = 0

    def start_requests(self):
        for url in self.settings.get('START_URLS', None):
            yield scrapy.Request(url=url, callback=self.parse)

    def articles_links(self, response):
        for article in response.css('#mntl-taxonomysc-article-list_1-0 > a'):
            if self.check_blacklisted(article.css('::attr(href)').get()):
                self._id_counter += 1
                article_image = article.css('div > div > img ::attr(data-src)').get()

                yield response.follow(
                    article.css('::attr(href)').get(),
                    callback=self.parse_articles,
                    cb_kwargs={"article_image": article_image}
                )

    def parse(self, response, **kwargs):
        for link in response.css('#mntl-taxonomy-sibling-node__container_1-0 > a ::attr(href)').getall():
            yield response.follow(link, callback=self.articles_links)
        self.logger.info("Crawling: %s", response.url)

    @staticmethod
    def check_blacklisted(link: str) -> bool:
        """Return True when the link is NOT blacklisted, i.e. it points to an article page that should be followed."""
        for blacklisted in ['simulator', 'about-us', 'financial-term', 'newsletter', 'academy', '/terms/', '/careers']:
            if blacklisted in link:
                return False
        return True

    @staticmethod
    def parse_articles(response, article_image):
        item = InvestopediaScraperItem()

        item['image'] = article_image
        item['title'] = response.css('header > h1 ::text').get()
        item['category'] = response.css('#breadcrumbs_1-0 > li > a > span ::text').extract()
        item['author'] = response.xpath('//*[@id="article-header_1-0"]/div[2]/div/div[1]/div[1]/div/a/text()').get()
        item['reviewer'] = response.xpath('//*[@id="article-header_1-0"]/div[2]/div/div[2]/div').css('div > a ::text').get()
        item['facts_checker'] = response.xpath('//*[@id="article-header_1-0"]/div[2]/div/div[3]').css('div > a ::text').get()
        item['contributors'] = response.xpath('//*[@id="main"]/article/div[2]/header/div[2]/div/div/div/div').css('::attr(href)').getall()
        item['date'] = response.css('div.mntl-attribution__item-date ::text').get()
        item["link"] = response.url
        item['paragraph'] = response.css('#article-body_1-0 > #mntl-sc-page_1-0').css('::text').extract()

        yield item
src/spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.