
Autoscout24 Germany / Deutschland - Scraper

An Autoscout24 scraper for the latest listings across all of Germany, with leasing data, price data, further vehicle data, address data, and Google Maps links.

Pricing: $9.99 / 1,000 listings
Rating: 0.0 (0)
Total users: 1
Monthly users: 1
Runs succeeded: >99%
Last modified: 2 days ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version,
# and all installed packages with their versions, for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, a quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "autoscout24-scraper", "title": "Autoscout24 Car Listings Scraper", "description": "Professioneller Scraper für Fahrzeugangebote auf Autoscout24. Extrahiert Preise, technische Daten, Verkäuferinformationen und Leasing-Konditionen in Echtzeit.", "version": "1.0.0", "buildTag": "latest", "meta": { "templateId": "python-start", "categories": ["DATA EXTRACTION", "CARS"], "features": [ "proxy support", "dataset output", "pagination" ] }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Autoscout24 Scraper Konfiguration", "type": "object", "schemaVersion": 1, "properties": { "countries": { "title": "Länderauswahl", "type": "object", "description": "Länder und ISO-Codes (z.B. {'Deutschland': 'DE'})", "editor": "json", "default": {"Deutschland": "DE"} }, "max_page": { "title": "Maximale Seiten", "type": "integer", "description": "Maximal zu scrapende Seiten pro Land (1-50)", "minimum": 1, "maximum": 50, "default": 5 }, "delay_min": { "title": "Minimale Verzögerung (Sekunden)", "type": "integer", "description": "Mindestverzögerung zwischen Anfragen", "default": 1, "minimum": 0 }, "delay_max": { "title": "Maximale Verzögerung (Sekunden)", "type": "integer", "description": "Maximalverzögerung zwischen Anfragen", "default": 3, "minimum": 1 }, "batch_size": { "title": "Batch-Größe", "type": "integer", "description": "Anzahl der Datensätze pro Speichervorgang (min. 10)", "default": 100, "minimum": 10 }, "base_url": { "title": "Basis-URL", "type": "string", "description": "Autoscout24 Basis-URL", "default": "https://www.autoscout24.de", "editor": "textfield" } }, "required": []}
src/__init__.py
src/__main__.py
import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/main.py
from apify import Actor
from bs4 import BeautifulSoup
from httpx import AsyncClient
import logging
import re
import json
import hashlib
from urllib.parse import urljoin
import random
from datetime import datetime
from tenacity import retry, stop_after_attempt, wait_exponential
import asyncio

# Configure logging
logging.basicConfig(level=logging.INFO)

class ApifyScraperConfig:
    """Configuration class adapted for Apify"""

    def __init__(self, actor_input):
        self.countries = actor_input.get('countries', {'Deutschland': 'D'})
        self.base_url = actor_input.get('base_url', 'https://www.autoscout24.de')
        self.max_page = actor_input.get('max_page', 5)
        # Honour the delay_min/delay_max fields defined in the input schema
        self.delay_range = (
            actor_input.get('delay_min', 1),
            actor_input.get('delay_max', 3),
        )
        self.batch_size = actor_input.get('batch_size', 100)

def _sanitize_string(raw_str: str) -> str:
    """
    Sanitizes a string for use as a column name:
    - Converts German umlauts, ß and the euro sign to ASCII equivalents.
    - Replaces any other non-alphanumeric characters (except underscore) with an underscore.
    - Converts to lowercase.
    """
    # Convert German umlauts, ß and the euro sign
    replacements = {
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'Ä': 'ae', 'Ö': 'oe', 'Ü': 'ue', '€': 'eur'
    }
    for umlaut, ascii_eq in replacements.items():
        raw_str = raw_str.replace(umlaut, ascii_eq)

    raw_str = raw_str.strip()

    # Replace any character that's not alphanumeric or underscore with an underscore
    sanitized = re.sub(r'[^0-9a-zA-Z_]', '_', raw_str)

    return sanitized.lower()

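# A few illustrative inputs/outputs for the sanitizer (hedged: derived from the
# rules above, not from recorded scraper output):
#   _sanitize_string("Fahrleistung p.a.")   -> "fahrleistung_p_a_"
#   _sanitize_string("Leasinggesamtbetrag") -> "leasinggesamtbetrag"
#   _sanitize_string("Türen")               -> "tueren"
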
def _class_starts_with(prefix):
    """Helper function to check if a CSS class starts with a specific prefix."""
    def checker(css_class):
        return css_class and css_class.startswith(prefix)
    return checker

def _remove_leasing_configurator(tag):
    """Filter function to exclude leasing configurator tags."""
    # Keep the tag only if none of its classes belong to the leasing configurator;
    # tags without any class attribute are kept as well
    return all("LeasingConfigurator" not in class_name for class_name in tag.get("class", []))

def _get_data_hash(car_data):
    """Generate a consistent hash for car data to identify duplicates."""
    # Exclude 'date' and 'url' from the hash to identify the same car across listings
    data_to_hash = {k: v for k, v in car_data.items() if k not in ('date', 'url')}
    # Sort keys to ensure a consistent JSON string
    data_str = json.dumps(data_to_hash, sort_keys=True)
    return hashlib.md5(data_str.encode()).hexdigest()

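# Illustrative behaviour (hedged example, not recorded output): two snapshots of
# the same car that differ only in scrape date and listing URL hash identically,
# which is what makes re-listings detectable as duplicates:
#   a = {"marke": "BMW", "preis": "19999", "date": "2024-01-01", "url": "https://.../1"}
#   b = {"marke": "BMW", "preis": "19999", "date": "2024-02-01", "url": "https://.../2"}
#   _get_data_hash(a) == _get_data_hash(b)  # True
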
def _safe_get_text(tag):
    """Safely get text from a tag, returning None if the tag is None."""
    if tag:
        return tag.text.strip()
    return None

def _parse_currency(text):
    """Parse currency values from text."""
    if not text:
        return None
    numbers = re.findall(r'[0-9,.]+', text)
    if numbers:
        # Replace the decimal comma with a dot and remove thousands separators
        return re.sub(r'\.(?=\d{3})', '', numbers[0].replace(',', '.'))
    return None

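# Illustrative conversions for German-formatted values (hedged examples):
#   _parse_currency("19.999,50 €")  -> "19999.50"  (decimal comma becomes a dot)
#   _parse_currency("1.234.567 km") -> "1234567"   (thousands dots removed)
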
def _scrape_price(soup):
    """Extract price information from the car detail page."""
    # Find the main container
    main_container = soup.find("div", {"data-testid": "price-section"})
    if not main_container:
        return "Not found", "Not found", "Not found"

    # Extract price
    price_span = main_container.find("span", class_=_class_starts_with("PriceInfo_price__"))
    price = price_span.find(string=True).strip() if price_span else "Not found"

    # Extract additional info
    additional_info = main_container.find("p", class_=_class_starts_with("AdditionalPriceInfo_additionalPrice__"))
    additional_text = additional_info.text.strip() if additional_info else "Not found"

    # Extract price rating
    price_rating_div = main_container.find("div", class_=_class_starts_with("scr-price-label"))
    price_rating = price_rating_div.find("p").text.strip() if price_rating_div and price_rating_div.find("p") else "Not found"

    return price, additional_text, price_rating

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
async def fetch_overview_page(country_code, page, base_url, client):
    """Async overview page fetcher."""
    url = f"{base_url}/lst/?sort=age&desc=1&ustate=N%2CU&size=20&page={page}&cy={country_code}&atype=C&"
    try:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        return soup.find_all("a")
    except Exception as e:
        logging.error(f"Error fetching overview page {url}: {str(e)}")
        # Re-raise so the @retry decorator actually retries; swallowing the
        # exception here would disable tenacity's retry logic entirely
        raise

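# For reference, the overview URL built above expands, for Germany ("D") and page 1, to:
#   https://www.autoscout24.de/lst/?sort=age&desc=1&ustate=N%2CU&size=20&page=1&cy=D&atype=C&
# i.e. listings sorted by age in descending order (newest first), new and used
# cars (ustate=N,U), 20 results per page.
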
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
async def fetch_car_details(car_url, base_url, client):
    """Async car details fetcher."""
    full_url = urljoin(base_url, car_url)
    try:
        response = await client.get(full_url, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        car_dict = {"url": full_url}  # Store the full URL for reference

        # Extract dt/dd pairs
        all_dt_tags = soup.find_all("dt")
        dt_tags = list(filter(_remove_leasing_configurator, all_dt_tags))
        dd_tags = soup.find_all("dd")

        if len(dt_tags) == len(dd_tags):
            for dt, dd in zip(dt_tags, dd_tags):
                raw_key = dt.get_text(strip=True)
                sanitized_key = _sanitize_string(raw_key)
                value = dd.get_text(separator=";", strip=True)
                car_dict[sanitized_key] = value
        else:
            logging.warning("Mismatch in dt/dd pairs in %s", full_url)

        # Additional fields
        # Seller type - initialize with a default value
        seller_type = None

        # Strategy 1: scr-tag with an aria-label
        seller_tag = soup.find("span", class_="scr-tag", attrs={"aria-label": lambda x: x and x.lower() in ["händler", "privat"]})
        if seller_tag:
            seller_type = seller_tag.text.strip()
        else:
            # Strategy 2: context-based search within the vendor section
            vendor_section = soup.find("div", id="vendor-and-cta-section") or soup.find(class_=lambda x: x and "VendorData" in x)
            if vendor_section:
                vendor_tag = vendor_section.find("span", class_="scr-tag")
                seller_type = _safe_get_text(vendor_tag)

        car_dict["seller_type"] = seller_type if seller_type else "Not found"
        car_dict["is_dealer"] = (seller_type == "Händler") if seller_type else "Not found"
        car_dict["is_private"] = (seller_type == "Privat") if seller_type else "Not found"

        # Leasing section
        leasing_data = {}
        try:
            leasing_section = soup.find("div", {"data-cy": "leasing-section"}) or soup.find("section", id="leasing-section")

            if leasing_section:
                # General leasing data
                leasing_tags = leasing_section.find_all("dl", class_=lambda x: x and "DataGrid_asTable" in x)
                for dl in leasing_tags:
                    for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
                        key = _sanitize_string(dt.get_text(strip=True))
                        value = dd.get_text(strip=True)
                        leasing_data[key] = value

                # Key figures
                monthly_rate_text = leasing_section.find(string=re.compile(r"Monatliche Rate", re.IGNORECASE))
                if monthly_rate_text and monthly_rate_text.find_next("dd"):
                    leasing_data["monthly_rate"] = _parse_currency(monthly_rate_text.find_next("dd").text)

                # Target group from the scr-tag
                target_group_tag = leasing_section.find("span", class_="scr-tag")
                if target_group_tag:
                    leasing_data["target_group"] = [x.strip() for x in target_group_tag.text.split("/")]

                # Raw data for debugging
                car_dict["leasing_raw"] = json.dumps(leasing_data)

                # Normalized fields; the keys were run through _sanitize_string above,
                # so they must be looked up in their sanitized (lowercase) form
                car_dict["leasing_monthly"] = leasing_data.get("monthly_rate")
                car_dict["leasing_total"] = _parse_currency(leasing_data.get("leasinggesamtbetrag"))
                car_dict["leasing_contract_type"] = leasing_data.get("vertragsart")
                car_dict["leasing_mileage"] = leasing_data.get("fahrleistung_p_a_")
        except Exception as e:
            logging.warning(f"Error extracting leasing data: {str(e)}")
            car_dict["leasing_error"] = str(e)

        # Location
        location_a = soup.find("a", class_="scr-link Department_link__xMUEe")
        if location_a:
            maps_link = location_a.get("href", "")
            address_text = location_a.get_text(separator=" ", strip=True)  # replaces <br> with spaces
            address_clean = re.sub(r'\s+,', ',', address_text)
            car_dict["address"] = address_clean if address_clean else None
            car_dict["maps_link"] = maps_link
        else:
            car_dict["address"] = None
            car_dict["maps_link"] = None

        # Price
        price_raw, additional_text, price_rating = _scrape_price(soup)
        car_dict["price_raw"] = price_raw
        # Leasing-only detection
        car_dict["is_leasing_only"] = False
        if price_raw and price_raw != "Not found":
            # Case-insensitive search for leasing indicators
            if re.search(r'(?i)\b(leasing|mtl?\.?)\b', price_raw):
                car_dict["is_leasing_only"] = True

        # Add numeric price
        if price_raw and price_raw != "Not found":
            price_numbers = "".join(re.findall(r'[0-9]+', price_raw))
            car_dict["price"] = price_numbers
        else:
            car_dict["price"] = None
        car_dict["price_text"] = additional_text
        car_dict["price_rating"] = price_rating

        # Extract equipment list
        ausstattung_set = set()
        equip_divs = soup.find_all("div", attrs={"class": re.compile(r"cldt-equipment-block")})
        for div in equip_divs:
            text = div.get_text(separator="\n", strip=True)
            for line in text.split("\n"):
                if line:
                    ausstattung_set.add(line)
        car_dict["ausstattung_liste"] = sorted(ausstattung_set)

        car_dict["date"] = datetime.now().isoformat()
        car_dict["listing_hash"] = _get_data_hash(car_dict)

        return car_dict
    except Exception as e:
        logging.error(f"Error processing {full_url}: {str(e)}")
        # Re-raise so the @retry decorator can retry transient failures
        raise

async def process_car_listings(actor_input):
    """Main processing logic adapted for Apify."""
    async with AsyncClient() as client:
        config = ApifyScraperConfig(actor_input)
        # Key-value store values must be JSON-serializable, so the visited URLs
        # are persisted as a list and handled as a set in memory
        visited_urls = set(await Actor.get_value('visited-urls') or [])

        for country, country_code in config.countries.items():
            for page in range(1, config.max_page + 1):
                Actor.log.info(f"Processing page {page} for {country}")
                try:
                    a_tags = await fetch_overview_page(country_code, page, config.base_url, client)
                except Exception:
                    Actor.log.exception(f"Skipping page {page} for {country} after repeated failures")
                    continue
                car_urls = {link["href"] for link in a_tags if "/angebote/" in link.get("href", "")}

                new_urls = car_urls - visited_urls
                if not new_urls:
                    break

                for url in new_urls:
                    try:
                        car_data = await fetch_car_details(url, config.base_url, client)
                    except Exception:
                        Actor.log.exception(f"Skipping {url} after repeated failures")
                        continue
                    if car_data:
                        await Actor.push_data(car_data)
                        visited_urls.add(url)
                        await Actor.set_value('visited-urls', sorted(visited_urls))

                # Use a proper async sleep with a randomized delay between pages
                await asyncio.sleep(random.uniform(*config.delay_range))

async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Run the main processing; the set of visited URLs is persisted under the
        # 'visited-urls' key in the default key-value store
        await process_car_listings(actor_input)
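For quick local debugging outside the Apify platform, the overview fetcher can be exercised directly with httpx. This is a sketch, not part of the actor: it assumes the package layout above, that it is run from the repository root, and that autoscout24.de is reachable:

import asyncio
from httpx import AsyncClient
from src.main import fetch_overview_page

async def smoke() -> None:
    async with AsyncClient() as client:
        # "D" is the country code the scraper uses for Germany
        a_tags = await fetch_overview_page("D", 1, "https://www.autoscout24.de", client)
        links = {a.get("href") for a in a_tags if "/angebote/" in (a.get("href") or "")}
        print(f"Found {len(links)} listing links on page 1")

asyncio.run(smoke())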
src/py.typed
.dockerignore
.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.gitignore
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
apify-client
beautifulsoup4[lxml]
httpx
tenacity
python-dateutil
types-beautifulsoup4