Autoscout24 Germany / Deutschland - Scraper

Pricing: $9.99 / 1,000 listings

Developed by Alex
Maintained by Community

Autoscout24 scraper for the latest listings across all of Germany, with leasing data, price data, further vehicle data, address data, Google Maps links...

Rating: 0.0 (0)

Total users: 1
Monthly users: 1
Runs succeeded: >99%
Last modified: 2 days ago
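
As a rough cost estimate: each overview page is requested with size=20, so a run with the default max_page of 5 collects on the order of 100 listings per country, which at $9.99 per 1,000 listings comes to roughly $1.00.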

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "autoscout24-scraper",
    "title": "Autoscout24 Car Listings Scraper",
    "description": "Professional scraper for vehicle listings on Autoscout24. Extracts prices, technical data, seller information and leasing terms in real time.",
    "version": "1.0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-start",
        "categories": ["DATA EXTRACTION", "CARS"],
        "features": [
            "proxy support",
            "dataset output",
            "pagination"
        ]
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Autoscout24 Scraper Configuration",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "countries": {
            "title": "Country selection",
            "type": "object",
            "description": "Countries and ISO codes (e.g. {'Deutschland': 'DE'})",
            "editor": "json",
            "default": {"Deutschland": "DE"}
        },
        "max_page": {
            "title": "Maximum pages",
            "type": "integer",
            "description": "Maximum number of pages to scrape per country (1-50)",
            "minimum": 1,
            "maximum": 50,
            "default": 5
        },
        "delay_min": {
            "title": "Minimum delay (seconds)",
            "type": "integer",
            "description": "Minimum delay between requests",
            "default": 1,
            "minimum": 0
        },
        "delay_max": {
            "title": "Maximum delay (seconds)",
            "type": "integer",
            "description": "Maximum delay between requests",
            "default": 3,
            "minimum": 1
        },
        "batch_size": {
            "title": "Batch size",
            "type": "integer",
            "description": "Number of records per save operation (min. 10)",
            "default": 100,
            "minimum": 10
        },
        "base_url": {
            "title": "Base URL",
            "type": "string",
            "description": "Autoscout24 base URL",
            "default": "https://www.autoscout24.de",
            "editor": "textfield"
        }
    },
    "required": []
}
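
As a usage sketch, a run with these options can be started from Python with the apify-client package (listed in requirements.txt). The API token and actor ID below are placeholders, and the run_input mirrors the schema defaults.

# Minimal sketch of starting a run with the options above via apify-client.
# The API token and actor ID are placeholders - substitute your own values.
from apify_client import ApifyClient

client = ApifyClient("<YOUR_APIFY_TOKEN>")

run_input = {
    "countries": {"Deutschland": "DE"},  # country name -> ISO code, as in the schema default
    "max_page": 5,                       # overview pages per country (1-50)
    "delay_min": 1,                      # minimum pause between requests, in seconds
    "delay_max": 3,                      # maximum pause between requests, in seconds
    "batch_size": 100,                   # records per save operation
    "base_url": "https://www.autoscout24.de",
}

# call() starts the run and waits for it to finish.
run = client.actor("<USERNAME>/autoscout24-scraper").call(run_input=run_input)

# Iterate over the scraped listings in the run's default dataset.
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item.get("url"), item.get("price_raw"))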

src/__init__.py


src/__main__.py

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())

src/main.py

from apify import Actor
from bs4 import BeautifulSoup
from httpx import AsyncClient
import logging
import re
import json
import hashlib
from urllib.parse import urljoin, urlparse, parse_qs, unquote
import random
import time
from datetime import datetime
from tenacity import retry, stop_after_attempt, wait_exponential
import asyncio

# Configure logging
logging.basicConfig(level=logging.INFO)


class ApifyScraperConfig:
    """Configuration class adapted for Apify"""
    def __init__(self, actor_input):
        self.countries = actor_input.get('countries', {'Deutschland': 'D'})
        self.base_url = actor_input.get('base_url', 'https://www.autoscout24.de')
        self.max_page = actor_input.get('max_page', 5)
        # Respect the delay_min/delay_max options from the input schema.
        self.delay_range = (actor_input.get('delay_min', 1), actor_input.get('delay_max', 3))
        self.batch_size = actor_input.get('batch_size', 100)


def _sanitize_string(raw_str: str) -> str:
    """
    Sanitizes a string for use as a column name:
    - Converts German umlauts and ß to ASCII equivalents.
    - Replaces any other non-alphanumeric characters (except underscore) with an underscore.
    - Converts to lowercase.
    """
    # Convert German umlauts and ß
    replacements = {
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
        'Ä': 'ae', 'Ö': 'oe', 'Ü': 'ue', '€': 'eur'
    }
    for umlaut, ascii_eq in replacements.items():
        raw_str = raw_str.replace(umlaut, ascii_eq)

    raw_str = raw_str.strip()

    # Replace any character that's not alphanumeric or underscore with an underscore
    sanitized = re.sub(r'[^0-9a-zA-Z_]', '_', raw_str)

    return sanitized.lower()


def _class_starts_with(prefix):
    """Helper function to check if a CSS class starts with a specific prefix."""
    def checker(css_class):
        return css_class and css_class.startswith(prefix)
    return checker


def _remove_leasing_configurator(tag):
    """Filter function to exclude leasing configurator tags."""
    # Keep the tag only if none of its classes belong to the LeasingConfigurator widget.
    return not any("LeasingConfigurator" in class_name for class_name in tag.get("class", []))


def _get_data_hash(car_data):
    """Generate a consistent hash for car data to identify duplicates."""
    # Exclude 'scrape_date' and 'url' for hash to identify same car across listings
    data_to_hash = {k: v for k, v in car_data.items() if k not in ('scrape_date', 'url')}
    # Sort keys to ensure consistent JSON string
    data_str = json.dumps(data_to_hash, sort_keys=True)
    return hashlib.md5(data_str.encode()).hexdigest()


def _safe_get_text(tag):
    """Safely get text from a tag, returning None if tag is None."""
    if tag:
        return tag.text.strip()
    return None


def _parse_currency(text):
    """Parse currency values from text."""
    if not text:
        return None
    numbers = re.findall(r'[0-9,.]+', text)
    if numbers:
        # Replace comma with dot and remove thousands separators
        return re.sub(r'\.(?=\d{3})', '', numbers[0].replace(',', '.'))
    return None


def _scrape_price(soup):
    """Extract price information from the car detail page."""
    # Find the main container
    main_container = soup.find("div", {"data-testid": "price-section"})
    if not main_container:
        return "Not found", "Not found", "Not found"

    # Extract price
    price_span = main_container.find("span", class_=_class_starts_with("PriceInfo_price__"))
    price = price_span.find(text=True).strip() if price_span else "Not found"

    # Extract additional info
    additional_info = main_container.find("p", class_=_class_starts_with("AdditionalPriceInfo_additionalPrice__"))
    additional_text = additional_info.text.strip() if additional_info else "Not found"

    # Extract price rating
    price_rating_div = main_container.find("div", class_=_class_starts_with("scr-price-label"))
    price_rating = price_rating_div.find("p").text.strip() if price_rating_div and price_rating_div.find("p") else "Not found"

    return price, additional_text, price_rating


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
async def fetch_overview_page(country_code, page, base_url, client):
    """Async version of overview page fetcher"""
    url = f"{base_url}/lst/?sort=age&desc=1&ustate=N%2CU&size=20&page={page}&cy={country_code}&atype=C&"
    try:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        return soup.find_all("a")
    except Exception as e:
        logging.error(f"Error fetching overview page {url}: {str(e)}")
        return []


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
async def fetch_car_details(car_url, base_url, client):
    """Async version of car details fetcher"""
    full_url = urljoin(base_url, car_url)
    try:
        response = await client.get(full_url, follow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        car_dict = {"url": full_url}  # Store the full URL for reference

        # Extract dt/dd pairs
        all_dt_tags = soup.find_all("dt")
        dt_tags = list(filter(_remove_leasing_configurator, all_dt_tags))
        dd_tags = soup.find_all("dd")

        if len(dt_tags) == len(dd_tags):
            for dt, dd in zip(dt_tags, dd_tags):
                raw_key = dt.get_text(strip=True)
                sanitized_key = _sanitize_string(raw_key)
                value = dd.get_text(separator=";", strip=True)
                car_dict[sanitized_key] = value
        else:
            logging.warning("Mismatch in dt/dd pairs in %s", full_url)

        # Additional fields
        # Seller type - initialize with a default value
        seller_type = None

        # Strategy 1: SCR tag with aria-label
        seller_tag = soup.find("span", class_="scr-tag", attrs={"aria-label": lambda x: x and x.lower() in ["händler", "privat"]})
        if seller_tag:
            seller_type = seller_tag.text.strip()
        else:
            # Strategy 2: context-based search in the vendor section
            vendor_section = soup.find("div", id="vendor-and-cta-section") or soup.find(class_=lambda x: x and "VendorData" in x)
            if vendor_section:
                vendor_tag = vendor_section.find("span", class_="scr-tag")
                seller_type = _safe_get_text(vendor_tag)

        car_dict["seller_type"] = seller_type if seller_type else "Not found"
        car_dict["is_dealer"] = (seller_type == "Händler") if seller_type else "Not found"
        car_dict["is_private"] = (seller_type == "Privat") if seller_type else "Not found"

        # Leasing section
        leasing_data = {}
        try:
            leasing_section = soup.find("div", {"data-cy": "leasing-section"}) or soup.find("section", id="leasing-section")

            if leasing_section:
                # General leasing data
                leasing_tags = leasing_section.find_all("dl", class_=lambda x: x and "DataGrid_asTable" in x)
                for dl in leasing_tags:
                    for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
                        key = _sanitize_string(dt.get_text(strip=True))
                        value = dd.get_text(strip=True)
                        leasing_data[key] = value

                # Key figures
                monthly_rate_text = leasing_section.find(text=re.compile(r"Monatliche Rate", re.IGNORECASE))
                if monthly_rate_text and monthly_rate_text.find_next("dd"):
                    leasing_data["monthly_rate"] = _parse_currency(monthly_rate_text.find_next("dd").text)

                # Target group from the SCR tag
                target_group_tag = leasing_section.find("span", class_="scr-tag")
                if target_group_tag:
                    leasing_data["target_group"] = [x.strip() for x in target_group_tag.text.split("/")]

            # Raw data for debugging
            car_dict["leasing_raw"] = json.dumps(leasing_data)

            # Normalized fields (the dt keys were sanitized above, so look them up in sanitized form)
            car_dict["leasing_monthly"] = leasing_data.get("monthly_rate")
            car_dict["leasing_total"] = _parse_currency(leasing_data.get("leasinggesamtbetrag"))
            car_dict["leasing_contract_type"] = leasing_data.get("vertragsart")
            car_dict["leasing_mileage"] = leasing_data.get("fahrleistung_p_a_")
        except Exception as e:
            logging.warning(f"Error during leasing extraction: {str(e)}")
            car_dict["leasing_error"] = str(e)

        # Location
        location_a = soup.find("a", class_="scr-link Department_link__xMUEe")
        if location_a:
            maps_link = location_a.get("href", "")
            address_text = location_a.get_text(separator=" ", strip=True)  # replaces <br> with spaces
            address_clean = re.sub(r'\s+,', ',', address_text)
            car_dict["address"] = address_clean if address_clean else None
            car_dict["maps_link"] = maps_link
        else:
            car_dict["address"] = None
            car_dict["maps_link"] = None

        # Price
        price_raw, additional_text, price_rating = _scrape_price(soup)
        car_dict["price_raw"] = price_raw
        # Leasing info
        car_dict["is_leasing_only"] = False
        if price_raw and price_raw != "Not found":
            # Case-insensitive search for leasing indicators
            if re.search(r'(?i)\b(leasing|mtl?\.?)\b', price_raw):
                car_dict["is_leasing_only"] = True

        # Add numeric price
        if price_raw and price_raw != "Not found":
            price_numbers = "".join(re.findall(r'[0-9]+', price_raw))
            car_dict["price"] = price_numbers
        else:
            car_dict["price"] = None
        car_dict["price_text"] = additional_text
        car_dict["price_rating"] = price_rating

        # Extract equipment list
        ausstattung_set = set()
        equip_divs = soup.find_all("div", attrs={"class": re.compile(r"cldt-equipment-block")})
        for div in equip_divs:
            text = div.get_text(separator="\n", strip=True)
            for line in text.split("\n"):
                if line:
                    ausstattung_set.add(line)
        car_dict["ausstattung_liste"] = sorted(ausstattung_set)

        car_dict["date"] = datetime.now().isoformat()
        car_dict["listing_hash"] = _get_data_hash(car_dict)

        return car_dict
    except Exception as e:
        logging.error(f"Error processing {full_url}: {str(e)}")
        return None


async def process_car_listings(actor_input):
    """Main processing logic adapted for Apify"""
    async with AsyncClient() as client:
        config = ApifyScraperConfig(actor_input)
        # Visited URLs are persisted as a JSON list in the key-value store and used as a set in memory.
        visited_urls = set(await Actor.get_value('visited-urls') or [])

        for country, country_code in config.countries.items():
            for page in range(1, config.max_page + 1):
                Actor.log.info(f"Processing page {page} for {country}")
                a_tags = await fetch_overview_page(country_code, page, config.base_url, client)
                car_urls = {link["href"] for link in a_tags if "/angebote/" in link.get("href", "")}

                new_urls = car_urls - visited_urls
                if not new_urls:
                    break

                for url in new_urls:
                    car_data = await fetch_car_details(url, config.base_url, client)
                    if car_data:
                        await Actor.push_data(car_data)
                        visited_urls.add(url)
                        # Sets are not JSON-serializable, so store the state as a list.
                        await Actor.set_value('visited-urls', list(visited_urls))

                    # Use proper async sleep between requests
                    await asyncio.sleep(random.uniform(*config.delay_range))


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}

        # Run main processing; visited URLs are loaded from the key-value store there.
        await process_car_listings(actor_input)
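
As a quick sanity check, the pure helper functions can be exercised outside of Apify. The sample strings below are illustrative, and the expected outputs follow directly from the regular expressions in _sanitize_string, _parse_currency and _get_data_hash (assuming the package is importable as src.main, e.g. when run from the project root).

# Illustrative sanity checks for the pure helpers; run from the project root.
from src.main import _sanitize_string, _parse_currency, _get_data_hash

# Umlauts become ASCII, other non-alphanumeric characters become underscores.
print(_sanitize_string("Türen"))              # -> "tueren"
print(_sanitize_string("Fahrleistung p.a."))  # -> "fahrleistung_p_a_" (the leasing mileage key)

# Thousands separators are stripped, decimal commas become dots.
print(_parse_currency("18.990 €"))    # -> "18990"
print(_parse_currency("1.234,56 €"))  # -> "1234.56"

# The hash ignores 'url' and 'scrape_date', so the same car seen under two URLs collides on purpose.
a = {"make": "BMW", "price": "18990", "url": "https://example.com/listing-a"}
b = {"make": "BMW", "price": "18990", "url": "https://example.com/listing-b"}
print(_get_data_hash(a) == _get_data_hash(b))  # -> True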

src/py.typed

.dockerignore

.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
apify-client
beautifulsoup4[lxml]
httpx
tenacity
python-dateutil
types-beautifulsoup4