
Investopedia Scraper

Investopedia Scraper lets you extract all the articles you want from Investopedia.com.

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 65
Monthly users: 4
Runs succeeded: >99%
Last modified: a year ago
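Besides starting runs from the Apify Console, you can call the Actor programmatically. Below is a minimal sketch using the apify-client Python package; the API token placeholder and the Actor ID "glitch_404/investopedia-scraper" are assumptions, so substitute the actual ID shown on this page.

from apify_client import ApifyClient

# Authenticate with your Apify API token (placeholder value).
client = ApifyClient("<YOUR_APIFY_TOKEN>")

# Input mirroring the Actor's input schema.
run_input = {
    "startUrls": [{"url": "https://www.investopedia.com/markets-news-4427704"}],
    "dateRange": "anytime",
    "maxArticles": 50,
    "proxyConfiguration": {"useApifyProxy": True},
}

# Start the Actor and wait for the run to finish (Actor ID is illustrative).
run = client.actor("glitch_404/investopedia-scraper").call(run_input=run_input)

# Iterate over the scraped articles stored in the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item["title"], item["link"])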
.dockerignore
# Git folder
.git

# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.gitignore
# IDE
.idea/
.vscode/
.DS_Store

# Apify storage folder
storage/

# Python virtual environment
.venv/
.env/

# Python (and python tools) cache files
__pycache__/
*.pyc
.ruff_cache/
.mypy_cache/
.pytest_cache/

# Python build files
__pypackages__/
dist/
build/
*.egg-info/
*.egg

# log files
*.log
package.json
{ "name": "InvestopediaScraper", "version": "1.0.0", "description": "Investopedia Scraper will enable you to get all the Articles you want from Investopedia.com since they have no API.", "main": "src/main.py", "scripts": { "start": "python3 src/__main__.py", "test": "echo \"Error: no test specified\" && exit 1" }, "keywords": ["investopedia.com", "investopedia", "news scraper", "scraper"], "author": "Glitch_404", "engines": { "node": ">=16.0.0", "npm": ">=8.0.0" }, "license": "ISC", "dependencies": { "apify": "^0.19.1", "scrapy": "^2.11.0" }}
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.5.3
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.0
itemadapter ~= 0.8.0
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = src.settings
shell = 'ipython'

[deploy]
#url = http://localhost:6800/
project = src
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.11

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "Investopedia-Scraper", "title": "Investopedia-Scraper", "description": "Scrapes articles of investopedia.com using Scrapy", "buildTag": "latest", "version": "1.0", "readme": "./readme.md", "meta": { "templateId": "python-scrapy" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": "./dataset_schema.json" }, "minMemoryMbytes": 1024, "maxMemoryMbytes": 4096}
.actor/dataset_schema.json
{ "actorSpecification": 1, "views": { "overview": { "title": "Overview", "transformation": { "fields": [ "image", "title", "category", "author", "reviewer", "facts_checker", "contributors", "date", "link", "paragraph" ] }, "display": { "component": "table", "properties": { "Image": { "label": "Image", "format": "image" }, "Title": { "label": "Text", "format": "text" }, "Category": { "label": "Text", "format": "Text" }, "Author": { "label": "Text", "format": "text" }, "Reviewer": { "label": "Text", "format": "Text" }, "FactsChecker": { "label": "FactsChecker", "format": "Text" }, "Contributors": { "label": "Array", "format": "array" }, "Date": { "label": "Date", "format": "date" }, "Link": { "label": "Link", "format": "link" }, "Paragraph": { "label": "Text", "format": "Text" } } } } }}
.actor/input_schema.json
{ "title": "Python-Scrapy-InvestopediaScraper", "type": "object", "schemaVersion": 1.0, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "URLs to start with", "prefill": [ {"url": "https://www.investopedia.com/stocks-4427785"}, {"url": "https://www.investopedia.com/savings-accounts-4689728"}, {"url": "https://www.investopedia.com/personal-finance-4427760"}, {"url": "https://www.investopedia.com/markets-news-4427704"}, {"url": "https://www.investopedia.com/best-online-brokers-4587872"} ], "editor": "requestListSources" }, "dateRange": { "title": "Date Range", "type": "string", "description": "Scrape only in specific date range\nDate Can Only be is the formats\n\n* YYYY/M/D e.g. 2023/5/23 to get all the articles that have the same day\n\n* from YYYY/M/D to YYYY/M/D e.g. 2024/1/25 - 2024/1/30 to get a range of 5 days\n pay attention to the formate (spaces and characters) YYYY/M/D - YYYY/M/D\n\n* 25 days or 5 weeks or 3 months or 2 years\n only supported formates (day, week, month, year)\n this feature will always scrape from today's date till the date specified\ne.g. 2024/1/30 is today and you entered 5 days this will scrape from 2024/1/25 to 2024/1/30\n\n* if you don't care about the date and just want 100 aritcles, more or less just enter 'anytime' as a value e.g. anytime\n\n* Allowed Formats: 2023/1/25 - 2023/1/30 or 2020/5/26 or 3 days or anytime\n\n* Leave blank to scrape today's date articles", "prefill": "anytime", "editor": "textfield" }, "maxArticles": { "title": "Maximum Articles Amount", "type": "integer", "description": "Choose how many articles do you want", "prefill": 100, "default": 100, "minimum": 1, "maximum": 1000000 }, "proxyConfiguration": { "sectionCaption": "Proxy and HTTP configuration", "title": "Proxy configuration", "type": "object", "description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.", "editor": "proxy", "prefill": { "useApifyProxy": true }, "default": { "useApifyProxy": true } } }, "required": ["startUrls", "maxArticles"]}
src/__init__.py
src/__main__.py
1"""2This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's3logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.4
5This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally6or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using7`scrapy crawl title_spider`.8
9We recommend you do not modify this file unless you really know what you are doing.10"""11
12# We need to configure the logging first before we import anything else, so that nothing else imports13# `scrapy.utils.log` before we patch it.14from __future__ import annotations15from logging import StreamHandler, getLogger16from typing import Any17from scrapy.utils import log as scrapy_logging18from scrapy.utils.project import get_project_settings19from apify.log import ActorLogFormatter20
21# Define names of the loggers.22APIFY_LOGGER_NAMES = ['apify', 'apify_client']23SCRAPY_LOGGER_NAMES = ['filelock', 'hpack', 'httpx', 'scrapy', 'twisted']24ALL_LOGGER_NAMES = APIFY_LOGGER_NAMES + SCRAPY_LOGGER_NAMES25
26# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,27# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for28# a specific logger, do it in this file.29settings = get_project_settings()30LOGGING_LEVEL = settings['LOG_LEVEL']31
32# Define a logging handler which will be used for the loggers.33apify_handler = StreamHandler()34apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))35
36
37def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:38 """39 Configure a logger with the specified settings.40
41 Args:42 logger_name: The name of the logger to be configured.43 log_level: The desired logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', ...).44 handlers: Optional list of logging handlers.45 """46 logger = getLogger(logger_name)47 logger.setLevel(log_level)48 logger.handlers = []49
50 for handler in handlers:51 logger.addHandler(handler)52
53
54# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from55# the `main.py` and Scrapy components.56for logger_name in APIFY_LOGGER_NAMES:57 configure_logger(logger_name, LOGGING_LEVEL, apify_handler)58
59# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`60# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though61# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method62# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because63# otherwise we would lose some log messages.64old_configure_logging = scrapy_logging.configure_logging65
66
67def new_configure_logging(*args: Any, **kwargs: Any) -> None:68 """69 We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root70 logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary71 loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here72 these four loggers and the root logger.73 """74 old_configure_logging(*args, **kwargs)75
76 # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`77 # property within spiders. See details in the Spider logger property:78 # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.79 configure_logger(None, LOGGING_LEVEL, apify_handler)80
81 # We modify other loggers only by setting up their log level. A custom log handler is added82 # only to the root logger to avoid duplicate log messages.83 for logger_name in ALL_LOGGER_NAMES:84 configure_logger(logger_name, LOGGING_LEVEL)85
86 # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless87 # messages, especially when running on the platform.88 configure_logger('httpx', 'WARNING')89
90
91scrapy_logging.configure_logging = new_configure_logging92
93# Now we can do the rest of the setup94import asyncio95import os96import nest_asyncio97from scrapy.utils.reactor import install_reactor98from src.main import main99
100# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),101# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor102install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')103nest_asyncio.apply()104
105# Specify the path to the Scrapy project settings module106os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'107
108# Run the Apify main coroutine109asyncio.run(main())
src/items.py
import scrapy


class InvestopediaScraperItem(scrapy.Item):
    image = scrapy.Field(default=None, serializer=str)
    title = scrapy.Field(default=False, required=True, serializer=str)
    category = scrapy.Field(default=None, serializer=str)
    author = scrapy.Field(default=None, serializer=str)
    reviewer = scrapy.Field(default=None, serializer=str)
    facts_checker = scrapy.Field(default=None, serializer=str)
    contributors = scrapy.Field(default=None, serializer=str)
    date = scrapy.Field(default=None, serializer=str)
    paragraph = scrapy.Field(required=True, serializer=str)
    link = scrapy.Field(required=True, serializer=str)
src/main.py
from __future__ import annotations

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
from apify import Actor

# Import your Scrapy spider here
from src.spiders.Investopedia import InvestopediaSpider as Spider

# Default input values for local execution using `apify run`
LOCAL_DEFAULT_START_URLS = [
    {'url': "https://www.investopedia.com/stocks-4427785"},
    {'url': "https://www.investopedia.com/savings-accounts-4689728"},
    {'url': "https://www.investopedia.com/personal-finance-4427760"},
    {'url': "https://www.investopedia.com/markets-news-4427704"},
    {'url': "https://www.investopedia.com/best-online-brokers-4587872"},
]


def __custom_settings(start_urls, max_articles, date_range, proxy_config) -> Settings:
    settings = get_project_settings()

    # Use ApifyScheduler as the scheduler
    # settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'  # causes bugs

    # Add the ActorPushPipeline into the item pipelines, assigning it the highest integer (1000),
    # ensuring it is executed as the final step in the pipeline sequence
    settings['ITEM_PIPELINES']['src.pipelines.ActorPushPipeline'] = 1000

    # Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)  # causes bugs
    # settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
    # settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

    # Store the proxy configuration
    settings['APIFY_PROXY_SETTINGS'] = proxy_config

    # Apply the custom inputs to the settings
    settings['ALLOWED_DATE_RANGE'] = date_range
    settings['CLOSESPIDER_ITEMCOUNT'] = max_articles
    settings['START_URLS'] = start_urls

    return settings


async def main() -> None:
    """
    Apify Actor main coroutine for executing the Scrapy spider.
    """
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Process Actor input
        actor_input = await Actor.get_input() or {}

        start_urls = [url.get('url') for url in actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)]
        date_range = actor_input.get('dateRange', None)
        max_articles = actor_input.get('maxArticles', 100)
        proxy_config = actor_input.get('proxyConfiguration')

        Actor.log.info(f'\nstartUrls: {start_urls}\nmaxArticles: {max_articles}\ndateRange: {date_range}\n')

        # Apply the Apify settings; they override the Scrapy project settings
        settings = __custom_settings(start_urls, max_articles, date_range, proxy_config)

        # Execute the spider using Scrapy's CrawlerProcess
        Actor.log.info('Crawling started')

        process = CrawlerProcess(settings)
        process.crawl(Spider)
        process.start()
src/middlewares.py
import random
from src.user_agent import user_agents


class InvestopediaFakeHeadersMiddleware:
    def process_request(self, request, spider):
        # Set browser-like headers and rotate the platform and User-Agent on every request.
        request.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        request.headers['Accept-Encoding'] = 'gzip, deflate, br'
        request.headers['Sec-Ch-Ua-Platform'] = random.choice(['Windows', 'Linux', 'MacOS'])
        request.headers['User-Agent'] = random.choice(user_agents)
src/pipelines.py
import re
import math
import datetime
import apify.actor
import apify.scrapy.pipelines

from scrapy import Item, Spider
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem, CloseSpider


class ActorPushPipeline(apify.scrapy.pipelines.ActorDatasetPushPipeline):
    """
    A Scrapy pipeline for pushing items to an Actor's default dataset.
    This pipeline is designed to be enabled only when the Scrapy project is run as an Actor.
    """

    def __init__(self, item_count):
        self.item_count = item_count
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        item_count = crawler.settings.get('CLOSESPIDER_ITEMCOUNT')
        return cls(int(item_count))

    async def process_item(self, item: Item, spider: Spider) -> Item:
        """Pushes the provided Scrapy item to the Actor's default dataset."""
        self.counter += 1
        # Only push items up to the configured maximum.
        if self.counter <= self.item_count:
            item_dict = ItemAdapter(item).asdict()
            await apify.actor.Actor.push_data(item_dict)
            spider.logger.info(f'Item #{self.counter} pushed to the dataset.')
        return item


class InvestopediaSpiderPipeline:
    def __init__(self, allowed_date_range):
        self.today = datetime.datetime.today().date()
        self.months = ['january', 'february', 'march', 'april', 'may', 'june',
                       'july', 'august', 'september', 'october', 'november', 'december']
        self.allowed_date_range = allowed_date_range
        self.drops_counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        allowed_date_range = crawler.settings.get('ALLOWED_DATE_RANGE', None)
        return cls(allowed_date_range)

    def __is_allowed_date(self, item, article_date, allowed_date_range):
        # Single date, e.g. "2023/5/23"
        if '-' not in allowed_date_range and '/' in allowed_date_range:
            if (date := datetime.date(*list(map(int, allowed_date_range.split('/'))))) != article_date:
                raise DropItem(f"dropped item #{item} date {article_date} not in requested date range {date}")

        # Explicit range, e.g. "2024/1/25 - 2024/1/30"
        elif '-' in allowed_date_range and '/' in allowed_date_range:
            date = allowed_date_range.split('-')
            start_date = datetime.date(*list(map(int, date[0].split('/'))))
            end_date = datetime.date(*list(map(int, date[1].split('/'))))

            if not (start_date <= article_date <= end_date):
                raise DropItem(f"dropped item #{item} article date {article_date} not in requested date range")

        # Relative range, e.g. "25 days", "5 weeks", "3 months", "2 years"
        elif any([word in allowed_date_range for word in ['day', 'week', 'year', 'month']]):
            if 'day' in allowed_date_range:
                days_amount = int(allowed_date_range.replace('day', ''))
            elif 'week' in allowed_date_range:
                days_amount = math.floor(float(allowed_date_range.replace('week', '')) * 7)
            elif 'month' in allowed_date_range:
                days_amount = math.floor(float(allowed_date_range.replace('month', '')) * 30)
            elif 'year' in allowed_date_range:
                days_amount = math.floor(float(allowed_date_range.replace('year', '')) * 365.25)
            else:
                raise CloseSpider(f"{allowed_date_range} date requested is malformed and can't be processed")

            if self.today - datetime.timedelta(days=days_amount) > article_date:
                raise DropItem(f"dropped item #{item} article date {article_date} not in requested date range")
        else:
            raise CloseSpider(f"{allowed_date_range} date requested is malformed and can't be processed")

        return True

    def process_item(self, item: Item, spider: Spider) -> Item:
        spider.logger.info(f'Processing item #{item}')

        if self.drops_counter >= 50:
            raise CloseSpider(f"[DropLimit] Drop limit reached: {self.drops_counter} drops")

        # Parse dates like "Updated January 25, 2024" into a datetime.date.
        article_date = item['date'].replace(',', '').lower().split(' ')[1:]
        article_date = datetime.date(int(article_date[2]), self.months.index(article_date[0]) + 1, int(article_date[1]))
        try:
            if self.allowed_date_range:
                # Normalize the input, e.g. "5 Weeks" -> "5week".
                self.allowed_date_range = self.allowed_date_range.lower().replace(" ", "").replace('s', '')
                if 'anytime' not in self.allowed_date_range:
                    self.__is_allowed_date(item, article_date, self.allowed_date_range)
            else:
                if article_date != self.today:
                    raise DropItem(f"item {item} not from today's date")
        except DropItem as e:
            self.drops_counter += 1
            raise DropItem(f"item #{item} dropped due to ERROR: {e}")
        else:
            self.drops_counter = 0

        item['contributors'] = [f"{contributor}" for contributor in item['contributors'] if 'http' in contributor] or None
        item['date'] = article_date
        item['title'] = item['title'].strip()
        item['category'] = ','.join(item['category'])
        item['paragraph'] = re.sub(r"\n{2,}", "\n", ''.join(item['paragraph']).strip())

        return item
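To illustrate how InvestopediaSpiderPipeline interprets relative dateRange values, here is a small standalone sketch that mirrors the normalization logic above. It is illustrative only and not part of the project; the helper name normalized_days is hypothetical.

import math
import datetime

def normalized_days(date_range: str) -> int:
    # Mirrors the pipeline's handling of relative ranges: "5 weeks" -> 35 days, "3 months" -> 90 days.
    date_range = date_range.lower().replace(" ", "").replace('s', '')
    if 'day' in date_range:
        return int(date_range.replace('day', ''))
    if 'week' in date_range:
        return math.floor(float(date_range.replace('week', '')) * 7)
    if 'month' in date_range:
        return math.floor(float(date_range.replace('month', '')) * 30)
    if 'year' in date_range:
        return math.floor(float(date_range.replace('year', '')) * 365.25)
    raise ValueError(f"{date_range} is not a relative range")

print(normalized_days("5 weeks"))   # 35
print(normalized_days("3 months"))  # 90

# Earliest article date that would still be kept for "5 weeks".
print(datetime.date.today() - datetime.timedelta(days=normalized_days("5 weeks")))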
src/settings.py
1BOT_NAME = "InvestopediaCrawler"2SPIDER_MODULES = ["src.spiders"]3NEWSPIDER_MODULE = "src.spiders"4
5CLOSESPIDER_ITEMCOUNT = 1006ALLOWED_DATE_RANGE = None7START_URLS = ["https://www.investopedia.com/investing-in-the-uk-7548232"]8
9DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'10ROBOTSTXT_OBEY = True11COOKIES_ENABLED = True12
13# LOG_FILE = 'scrapy.log'14# LOG_FILE_APPEND = False15LOG_LEVEL = 'INFO'16
17DOWNLOADER_MIDDLEWARES = {18 "src.middlewares.InvestopediaFakeHeadersMiddleware": 300,19}20ITEM_PIPELINES = {21 'src.pipelines.InvestopediaSpiderPipeline': 10022}23
24REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"25TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"26FEED_EXPORT_ENCODING = "utf-8"
src/user_agent.py
user_agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9",
    "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/7.1.8 Safari/537.85.17",
    "Mozilla/5.0 (iPad; CPU OS 8_4 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H143 Safari/600.1.4",
    "Mozilla/5.0 (iPad; CPU OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F69 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.6.3 (KHTML, like Gecko) Version/8.0.6 Safari/600.6.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (iPad; CPU OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS x86_64 7077.134.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.156 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/7.1.7 Safari/537.85.16",
    "Mozilla/5.0 (Windows NT 6.0; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (iPad; CPU OS 8_1_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B466 Safari/600.1.4",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_1_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B440 Safari/600.1.4",
    "Mozilla/5.0 (Linux; U; Android 4.0.3; en-us; KFTT Build/IML74K) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
    "Mozilla/5.0 (iPad; CPU OS 7_1_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D201 Safari/9537.53",
    "Mozilla/5.0 (Linux; U; Android 4.4.3; en-us; KFTHWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.6.3 (KHTML, like Gecko) Version/7.1.6 Safari/537.85.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.4.10 (KHTML, like Gecko) Version/8.0.4 Safari/600.4.10",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2",
    "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) CriOS/45.0.2454.68 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B410 Safari/600.1.4",
    "Mozilla/5.0 (iPad; CPU OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53",
    "Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; TNJB; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; ARM; Trident/7.0; Touch; rv:11.0) like ",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MDDCJS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H143 Safari/600.1.4",
    "Mozilla/5.0 (Linux; U; Android 4.4.3; en-us; KFASWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) GSA/7.0.55539 Mobile/12H321 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; MATBJS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-us; KFJWI Build/IMM76D) AppleWebKit/537.36 (KHTML, like Gecko) Silk/3.68 like Chrome/39.0.2171.93 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 7_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D167 Safari/9537.53",
    "Mozilla/5.0 (X11; CrOS armv7l 7077.134.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.156 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0) Gecko/20100101 Firefox/39.0",
]
src/spiders/Investopedia.py
import scrapy
from src.items import InvestopediaScraperItem


class InvestopediaSpider(scrapy.Spider):
    # can scrape 952 articles in 73 seconds

    name = "InvestopediaSpider"
    allowed_domains = ["www.investopedia.com"]

    def __init__(self):
        super().__init__()
        self._id_counter = 0

    def start_requests(self):
        for url in self.settings.get('START_URLS', None):
            yield scrapy.Request(url=url, callback=self.parse)

    def articles_links(self, response):
        for article in response.css('#mntl-taxonomysc-article-list_1-0 > a'):
            if self.check_blacklisted(article.css('::attr(href)').get()):
                self._id_counter += 1
                article_image = article.css('div > div > img ::attr(data-src)').get()

                yield response.follow(
                    article.css('::attr(href)').get(),
                    callback=self.parse_articles,
                    cb_kwargs={"article_image": article_image}
                )

    def parse(self, response, **kwargs):
        for link in response.css('#mntl-taxonomy-sibling-node__container_1-0 > a ::attr(href)').getall():
            yield response.follow(link, callback=self.articles_links)
        self.logger.info("Crawling: %s", response.url)

    @staticmethod
    def check_blacklisted(link: str) -> bool:
        # Returns True when the link contains no blacklisted path segment, i.e. it is allowed to be crawled.
        link_blacklisted = True
        for blacklist in ['simulator', 'about-us', 'financial-term', 'newsletter', 'academy', '/terms/', '/careers']:
            if blacklist in link:
                link_blacklisted = False
        return link_blacklisted

    @staticmethod
    def parse_articles(response, article_image):
        item = InvestopediaScraperItem()

        item['image'] = article_image
        item['title'] = response.css('header > h1 ::text').get()
        item['category'] = response.css('#breadcrumbs_1-0 > li > a > span ::text').extract()
        item['author'] = response.xpath('//*[@id="article-header_1-0"]/div[2]/div/div[1]/div[1]/div/a/text()').get()
        item['reviewer'] = response.xpath('//*[@id="article-header_1-0"]/div[2]/div/div[2]/div').css('div > a ::text').get()
        item['facts_checker'] = response.xpath('//*[@id="article-header_1-0"]/div[2]/div/div[3]').css('div > a ::text').get()
        item['contributors'] = response.xpath('//*[@id="main"]/article/div[2]/header/div[2]/div/div/div/div').css('::attr(href)').getall()
        item['date'] = response.css('div.mntl-attribution__item-date ::text').get()
        item["link"] = response.url
        item['paragraph'] = response.css('#article-body_1-0 > #mntl-sc-page_1-0').css('::text').extract()

        yield item
src/spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.