Sold Realestate Au (Deprecated)

Pricing: Pay per usage
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Create and run as a non-root user.
RUN useradd --create-home apify && \
    chown -R apify:apify ./
USER apify

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]
.actor/actor.json
{ "actorSpecification": 1, "name": "sold-realestate-au", "title": "Sold Realestate_au Scraper", "description": "Sold Realestate_au Scraper ", "version": "0.0", "buildTag": "latest", "meta": { "templateId": "python-start" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "Sold Realestate_au Scraper", "type": "object", "schemaVersion": 1, "properties": { "address": { "title": "Put you address here", "type": "string", "description": "Address to scrape", "editor": "textfield" } }, "required": ["address"]}
src/__init__.py
src/__main__.py
import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/all_config.py
API_KEY = '7uYsiEQtr721LUrV7jghPBbgrzDC1uRDgTVYQHS5m9feJFfyv4RxZT70Hq2o'
SITEMAP_ID = 1299871
SEARCH_TYPE = 'sold'
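The values above are hard-coded. As an optional alternative, the same settings could be read from environment variables so the webscraper.io token does not live in source; this is only a sketch, and the variable names are assumptions rather than anything defined by this Actor:

import os

# Hypothetical environment-variable names; not part of the original Actor.
API_KEY = os.environ.get('WEBSCRAPER_API_TOKEN', '')
SITEMAP_ID = int(os.environ.get('WEBSCRAPER_SITEMAP_ID', '0'))
SEARCH_TYPE = os.environ.get('SEARCH_TYPE', 'sold')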
src/main.py
import asyncio

from apify import Actor
from httpx import AsyncClient

from .utils import (
    extract_data_lists,
    generate_url
)

from .webscraper_ai import (
    start_scraping,
    get_scraping_status_by_job,
    get_scraping_results
)

from .all_config import SEARCH_TYPE


async def main() -> None:
    async with Actor:
        # Retrieve the input object for the Actor. The structure of input is defined in input_schema.json.
        actor_input = await Actor.get_input() or {}
        address = actor_input.get('address')
        gen_url = generate_url(address, search_type=SEARCH_TYPE)

        # Example response: {'success': True, 'data': {'id': 28731019, 'custom_id': None}}
        start_scraping_st = start_scraping(url_=gen_url)
        if start_scraping_st.get('success'):
            Actor.log.info('Started scraping process on webscraper_io')
        else:
            Actor.log.error('Failed to start scraping process on webscraper_io')
            return

        job_id = start_scraping_st.get('data', {}).get('id', None)
        if job_id:
            Actor.log.info(f'Started successfully with job ID: {job_id} | webscraper_io')

            # Check the status of the scraping job until it is finished.
            while True:
                job_status = get_scraping_status_by_job(job_id)
                info_status = job_status.get('data', {}).get('status', None)
                # Use asyncio.sleep so the polling pause does not block the event loop.
                await asyncio.sleep(10)
                Actor.log.info(f'Job status: {info_status}')
                if info_status == 'finished':
                    Actor.log.info('Scraping job has finished')
                    break
                if info_status == 'stopped':
                    Actor.log.error('Scraping job has been stopped')
                    break

            check_after_status = get_scraping_status_by_job(job_id)
            check_after_status = check_after_status.get('data', {}).get('status', None)

            # if 'finished' == str(check_after_status) or 'stopped' == str(check_after_status):
            #     Actor.log.error('Scraping job did not finish successfully')
            #     return

            # Retrieve the scraping results from the job.
            scraping_results = get_scraping_results(job_id)
            if scraping_results:
                Actor.log.info('Found scraping results')

                # Extract the data from the scraping results.
                Actor.log.info('Extracting data from the scraping results')
                data_lists = extract_data_lists(scraping_results, SEARCH_TYPE)
                Actor.log.info(f'Total items: {len(data_lists)}')
                if data_lists:
                    for item in data_lists:
                        Actor.log.info('---------------------------------------------')
                        Actor.log.info(f'Extracted data: {item}')
                        Actor.log.info('---------------------------------------------')
                        await Actor.push_data(item)

            else:
                Actor.log.error('Failed to retrieve scraping results')
                return

        else:
            Actor.log.error('Failed to retrieve job ID')
            return
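The status loop in main() polls webscraper.io every 10 seconds until the job reports 'finished' or 'stopped'. If a hard upper bound on waiting time is wanted, the same polling logic can be wrapped in a helper with an attempt limit; the sketch below is only an illustration (the helper name, the 90-attempt cap, and the 10-second delay are arbitrary assumptions), not part of the original code:

import asyncio

from apify import Actor

from .webscraper_ai import get_scraping_status_by_job


# Sketch: would live alongside main.py in the src package.
async def wait_for_job(job_id, max_attempts=90, delay_s=10):
    """Poll the scraping job until it is 'finished'/'stopped' or the attempt limit is reached."""
    for _ in range(max_attempts):
        status = get_scraping_status_by_job(job_id).get('data', {}).get('status')
        Actor.log.info(f'Job status: {status}')
        if status in ('finished', 'stopped'):
            return status
        await asyncio.sleep(delay_s)
    Actor.log.error('Timed out waiting for the scraping job')
    return None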
src/py.typed
src/utils.py
from parsel.selector import Selector


def extract_data_lists(driver_source, search_type="buy"):
    items = []

    try:

        response = Selector(text=str(driver_source))
        properties = response.xpath(
            '//*[contains(@class, "PropertyCardLayout")]')
        for prop in properties:
            branding_title = ''
            agent_name = ''
            agent_img = ''
            property_img = ''
            address_heading = ''
            added = ''
            price = ''
            inspection = ''
            property_type = ''
            property_details = ''
            sold_status = ''
            sold_on = ''

            item = {}
            # ----------------------------------------------------------------
            prop_content_details = prop.xpath(
                './/*[contains(@class, "residential-card__content-wrapper")]')
            prop_content_upper = prop.xpath(
                './/*[contains(@class, "residential-card__banner-strip")]')

            # ----------------------------------------------------------------
            address_heading = prop_content_details.xpath(
                './/*[contains(@class, "residential-card__address-heading")]//text()').extract_first()
            residential_card_primary = prop_content_details.xpath(
                './/*[contains(@class, "residential-card__primary")]')

            price = prop_content_details.xpath(
                './/*[contains(@class, "property-price")]//text()').extract_first()
            property_type = residential_card_primary.xpath(
                './p//text()').extract_first()
            property_details = residential_card_primary.xpath(
                './/li//@aria-label').extract()
            property_details = ", ".join(property_details)

            added = prop_content_upper.xpath(
                './/*[contains(@class, "styles__Content")]//text()').extract()
            added = [ad for ad in added if "Added" in str(ad)]
            added = "".join(added)

            inspection = prop_content_details.xpath(
                './/*[contains(@class, "inspection__long-label")]//text()').extract_first()
            branding_title = prop.xpath(
                './/*[contains(@class, "branding__image")]//@alt').extract_first()
            agent_name = prop.xpath(
                './/*[contains(@class, "agent__name")]//text()').extract_first()
            agent_img = prop.xpath(
                './/*[contains(@class, "agent__name")]//following-sibling::img//@src').extract_first()

            property_img = prop.xpath(
                './/*[@class="property-image"]//@data-url').extract_first()

            sold_status = prop_content_upper.xpath(
                './/span[contains(@class, "styles__Content")]//text()').extract_first()
            sold_on = prop_content_details.xpath(
                './/span[contains(text(), "Sold on")]/text()').extract_first()

            # ----------------------------------------------------------------
            item['address_heading'] = address_heading
            item['price'] = price
            if search_type == "buy":
                item['added'] = added
                item['inspection'] = inspection
            if search_type == "sold":
                item['sold_status'] = sold_status
                item['sold_on'] = sold_on
            item['property_img'] = property_img
            item['property_type'] = property_type
            item['property_details'] = property_details
            item['branding_title'] = branding_title
            item['agent_name'] = agent_name
            item['agent_img'] = agent_img
            item['search_type'] = search_type

            items.append(item)

    except Exception as e:
        print(f"Error: {e}")
        return items

    return items


def generate_url(input_address, search_type="buy", page=1):
    correct_address = input_address.replace(" ", "+")
    url = f"https://www.realestate.com.au/{search_type}/in-{correct_address}/list-{page}?includeSurrounding=false&activeSort=solddate"
    return url


def properties_pages_count(driver_source):
    try:
        response_ = Selector(text=str(driver_source))
        return int(int(response_.xpath('//*[contains(@class, "StyledResultsCount")]//text()').extract_first().split()[-2]) / 25)
    except Exception as e:
        print(f"Error: {e}")
        return 0
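To make the helpers above concrete: generate_url('Bondi Beach NSW 2026', search_type='sold') returns https://www.realestate.com.au/sold/in-Bondi+Beach+NSW+2026/list-1?includeSurrounding=false&activeSort=solddate, and for a 'sold' search each dict produced by extract_data_lists carries the keys shown below. The values here are placeholders, not real scraped data:

# Illustrative shape of one extracted item for search_type='sold'; all values are made up.
example_item = {
    'address_heading': '12 Example Street, Suburb NSW 2000',
    'price': '$1,000,000',
    'sold_status': 'Sold',
    'sold_on': 'Sold on 01 Jan 2025',
    'property_img': 'https://example.com/property.jpg',
    'property_type': 'House',
    'property_details': '3 bedrooms, 2 bathrooms, 1 car space',
    'branding_title': 'Example Realty',
    'agent_name': 'Jane Agent',
    'agent_img': 'https://example.com/agent.jpg',
    'search_type': 'sold',
}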
src/webscraper_ai.py
import json
import httpx
from .all_config import (
    API_KEY, SITEMAP_ID
)

BASE_URL = "https://api.webscraper.io/api/v1/scraping-job"


def start_scraping(sitemap_id=SITEMAP_ID, url_=None):
    """Start a scraping job for a given sitemap ID."""
    url = f"{BASE_URL}?api_token={API_KEY}"  # Ensure API token is included in the URL

    headers = {"Content-Type": "application/json",
               "Accept": "application/json"}
    payload = {
        "sitemap_id": sitemap_id,
        "driver": "fulljs",
        "page_load_delay": 2000,
        "request_interval": 1000,
        "proxy": 1,
        "start_urls": [
            url_
        ],
    }

    with httpx.Client() as client:
        # Use json=payload instead of data=
        response = client.post(url, headers=headers, json=payload)

    if response.status_code == 200:
        return response.json()
    else:
        print(
            f"Error starting scraping: {response.status_code} - {response.text}")
        return {}

# start_scrape: {'success': True, 'data': {'id': 28731019, 'custom_id': None}}


def get_scraping_status_by_job(job_id):
    """Fetches the status of a scraping job."""
    url = f"{BASE_URL}/{job_id}?api_token={API_KEY}"  # Job status endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        return response.json()
    else:
        print(
            f"Error fetching status: {response.status_code} - {response.text}")
        return {}


# Scraping Job Status:
# {'success': True, 'data': {'id': 28731019, 'custom_id': None, 'sitemap_id': 1274721, 'status': 'started',
#  'test_run': 0, 'sitemap_name': 'realestate_au',
#  'time_created': 1740759317, 'jobs_scheduled': 2,
#  'jobs_executed': 29, 'jobs_failed': 0,
#  'stored_record_count': 29, 'request_interval': 1000,
#  'page_load_delay': 2000, 'driver': 'fulljs',
#  'jobs_empty': 0, 'jobs_no_value': 0, 'scraping_duration': 207,
#  'scheduled': 0}}


def get_scraping_results(job_id):
    """Fetches the scraped data of a completed job."""
    url = f"{BASE_URL}/{job_id}/json?api_token={API_KEY}"  # Results endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        try:
            # Capture the raw response text
            data = response.text
            print("Raw Response Text:")
            # print(data)  # Print the raw response text

            # Process each line in the response as an individual JSON object
            results = []
            for line in data.splitlines():
                try:
                    json_data = json.loads(line)  # Parse each line as JSON
                    results.append(json_data)
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
                    continue

            print("Parsed JSON Data:")
            # Print the parsed JSON data in a readable format
            # print(json.dumps(results, indent=4))

            # Save the parsed data to a file
            with open("scraped_data.json", "w") as json_file:
                json.dump(results, json_file, indent=4)

            return results
        except Exception as e:
            print(f"Error processing response: {e}")
            return {}

    else:
        print(f"Error fetching results: {response.status_code}")
        return {}


"""
Method: POST
URL: https://api.webscraper.io/api/v1/scraping-job?api_token=<YOUR API TOKEN>
JSON:
{
    "sitemap_id": 123,
    "driver": "fast",  // "fast" or "fulljs"
    "page_load_delay": 2000,
    "request_interval": 2000,
    "proxy": 0,  // 0: No proxy, 1: Use proxy, 123: Custom proxy id, 'residential-*': Use residential proxy, replace * with country code, for example, 'residential-us'
    "start_urls": [  // optional, if set, will overwrite sitemap start URLs
        "https://www.webscraper.io/test-sites/e-commerce/allinone/computers",
        "https://www.webscraper.io/test-sites/e-commerce/allinone/phones"
    ],
    "custom_id": "custom-scraping-job-12"  // optional, will be included in webhook notification
}
"""
.dockerignore
.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.gitignore
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Added by Apify CLI
node_modules
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
parsel
httpx