Sold Realestate Au

Deprecated

Developed by Maurice
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 1
Monthly users: 1
Last modified: a month ago

.actor/Dockerfile

# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13
# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY requirements.txt ./
# Install the packages specified in requirements.txt,
# Print the installed Python version, pip version
# and all installed packages with their versions for debugging
RUN echo "Python version:" \
&& python --version \
&& echo "Pip version:" \
&& pip --version \
&& echo "Installing dependencies:" \
&& pip install -r requirements.txt \
&& echo "All installed Python packages:" \
&& pip freeze
# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./
# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .
# Create and run as a non-root user.
RUN useradd --create-home apify && \
chown -R apify:apify ./
USER apify
# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run
CMD ["python3", "-m", "src"]

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "sold-realestate-au",
    "title": "Sold Realestate_au Scraper",
    "description": "Sold Realestate_au Scraper",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "Sold Realestate_au Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "address": {
            "title": "Put your address here",
            "type": "string",
            "description": "Address to scrape",
            "editor": "textfield"
        }
    },
    "required": ["address"]
}
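
A valid input for this schema is simply an object with an address string, for example (the address below is only an illustrative placeholder):

{
    "address": "carlton vic 3053"
}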

src/__init__.py


src/__main__.py

import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())

src/all_config.py

# Web Scraper Cloud (webscraper.io) API token used by src/webscraper_ai.py.
API_KEY = '7uYsiEQtr721LUrV7jghPBbgrzDC1uRDgTVYQHS5m9feJFfyv4RxZT70Hq2o'
# ID of the Web Scraper Cloud sitemap that scrapes realestate.com.au result pages.
SITEMAP_ID = 1299871
# Listing type used to build the realestate.com.au search URL ('sold' or 'buy').
SEARCH_TYPE = 'sold'
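
The token and sitemap ID are committed in plain text here. If you would rather keep them out of the source, a drop-in variant of this module could pull them from environment variables instead (a sketch only; the WEBSCRAPER_API_TOKEN and WEBSCRAPER_SITEMAP_ID variable names are invented for illustration):

import os

# Read the credentials from the environment, falling back to empty/default
# values so the module still imports when the variables are not set.
API_KEY = os.environ.get('WEBSCRAPER_API_TOKEN', '')
SITEMAP_ID = int(os.environ.get('WEBSCRAPER_SITEMAP_ID', '0'))
SEARCH_TYPE = os.environ.get('SEARCH_TYPE', 'sold')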

src/main.py

import asyncio

from apify import Actor

from .utils import (
    extract_data_lists,
    generate_url
)

from .webscraper_ai import (
    start_scraping,
    get_scraping_status_by_job,
    get_scraping_results
)

from .all_config import SEARCH_TYPE


async def main() -> None:
    async with Actor:
        # Retrieve the input object for the Actor. The structure of input is defined in input_schema.json.
        actor_input = await Actor.get_input() or {}
        address = actor_input.get('address')
        gen_url = generate_url(address, search_type=SEARCH_TYPE)

        # Example response: {'success': True, 'data': {'id': 28731019, 'custom_id': None}}
        start_scraping_st = start_scraping(url_=gen_url)
        if start_scraping_st.get('success'):
            Actor.log.info('Started scraping process on webscraper_io')
        else:
            Actor.log.error('Failed to start scraping process on webscraper_io')
            return

        job_id = start_scraping_st.get('data', {}).get('id', None)
        if job_id:
            Actor.log.info(f'Started successfully with job ID: {job_id} | webscraper_io')

            # Check the status of the scraping job until it is finished.
            while True:
                job_status = get_scraping_status_by_job(job_id)
                info_status = job_status.get('data', {}).get('status', None)
                await asyncio.sleep(10)
                Actor.log.info(f'Job status: {info_status}')
                if info_status == 'finished':
                    Actor.log.info('Scraping job has finished')
                    break
                if info_status == 'stopped':
                    Actor.log.error('Scraping job has been stopped')
                    break

            check_after_status = get_scraping_status_by_job(job_id)
            check_after_status = check_after_status.get('data', {}).get('status', None)

            # if 'finished' == str(check_after_status) or 'stopped' == str(check_after_status):
            #     Actor.log.error('Scraping job did not finish successfully')
            #     return

            # Retrieve the scraping results from the job.
            scraping_results = get_scraping_results(job_id)
            if scraping_results:
                Actor.log.info('Found scraping results:')

                # Extract the data from the scraping results.
                Actor.log.info('Extracting data from the scraping results')
                data_lists = extract_data_lists(scraping_results, SEARCH_TYPE)
                Actor.log.info(f'Total items: {len(data_lists)}')
                if data_lists:
                    for item in data_lists:
                        Actor.log.info('---------------------------------------------')
                        Actor.log.info(f'Extracted data: {item}')
                        Actor.log.info('---------------------------------------------')
                        await Actor.push_data(item)

            else:
                Actor.log.error('Failed to retrieve scraping results')
                return

        else:
            Actor.log.error('Failed to retrieve job ID')
            return
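
For reference, each dataset item pushed by the loop above carries the fields assembled in src/utils.py; with SEARCH_TYPE set to 'sold', an item would look roughly like this (every value below is an invented placeholder):

{
    'address_heading': '12 Example Street, Carlton, VIC 3053',
    'price': '$1,250,000',
    'sold_status': 'Sold',
    'sold_on': 'Sold on 15 Feb 2025',
    'property_img': 'https://example.com/property.jpg',
    'property_type': 'House',
    'property_details': '3 bedrooms, 2 bathrooms, 1 car space',
    'branding_title': 'Example Realty Carlton',
    'agent_name': 'Jane Citizen',
    'agent_img': 'https://example.com/agent.jpg',
    'search_type': 'sold'
}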

src/py.typed

src/utils.py

from parsel.selector import Selector


def extract_data_lists(driver_source, search_type="buy"):
    items = []

    try:
        response = Selector(text=str(driver_source))
        properties = response.xpath('//*[contains(@class, "PropertyCardLayout")]')
        for prop in properties:
            branding_title = ''
            agent_name = ''
            agent_img = ''
            property_img = ''
            address_heading = ''
            added = ''
            price = ''
            inspection = ''
            property_type = ''
            property_details = ''
            sold_status = ''
            sold_on = ''

            item = {}
            # ----------------------------------------------------------------
            prop_content_details = prop.xpath(
                './/*[contains(@class, "residential-card__content-wrapper")]')
            prop_content_upper = prop.xpath(
                './/*[contains(@class, "residential-card__banner-strip")]')

            # ----------------------------------------------------------------
            address_heading = prop_content_details.xpath(
                './/*[contains(@class, "residential-card__address-heading")]//text()').extract_first()
            residential_card_primary = prop_content_details.xpath(
                './/*[contains(@class, "residential-card__primary")]')

            price = prop_content_details.xpath(
                './/*[contains(@class, "property-price")]//text()').extract_first()
            property_type = residential_card_primary.xpath(
                './p//text()').extract_first()
            property_details = residential_card_primary.xpath(
                './/li//@aria-label').extract()
            property_details = ", ".join(property_details)

            added = prop_content_upper.xpath(
                './/*[contains(@class, "styles__Content")]//text()').extract()
            added = [ad for ad in added if "Added" in str(ad)]
            added = "".join(added)

            inspection = prop_content_details.xpath(
                './/*[contains(@class, "inspection__long-label")]//text()').extract_first()
            branding_title = prop.xpath(
                './/*[contains(@class, "branding__image")]//@alt').extract_first()
            agent_name = prop.xpath(
                './/*[contains(@class, "agent__name")]//text()').extract_first()
            agent_img = prop.xpath(
                './/*[contains(@class, "agent__name")]//following-sibling::img//@src').extract_first()

            property_img = prop.xpath(
                './/*[@class="property-image"]//@data-url').extract_first()

            sold_status = prop_content_upper.xpath(
                './/span[contains(@class, "styles__Content")]//text()').extract_first()
            sold_on = prop_content_details.xpath(
                './/span[contains(text(), "Sold on")]/text()').extract_first()

            # ----------------------------------------------------------------
            item['address_heading'] = address_heading
            item['price'] = price
            if search_type == "buy":
                item['added'] = added
                item['inspection'] = inspection
            if search_type == "sold":
                item['sold_status'] = sold_status
                item['sold_on'] = sold_on
            item['property_img'] = property_img
            item['property_type'] = property_type
            item['property_details'] = property_details
            item['branding_title'] = branding_title
            item['agent_name'] = agent_name
            item['agent_img'] = agent_img
            item['search_type'] = search_type

            items.append(item)

    except Exception as e:
        print(f"Error: {e}")
        return items

    return items


def generate_url(input_address, search_type="buy", page=1):
    correct_address = input_address.replace(" ", "+")
    url = f"https://www.realestate.com.au/{search_type}/in-{correct_address}/list-{page}?includeSurrounding=false&activeSort=solddate"
    return url


def properties_pages_count(driver_source):
    try:
        response_ = Selector(text=str(driver_source))
        return int(int(response_.xpath(
            '//*[contains(@class, "StyledResultsCount")]//text()'
        ).extract_first().split()[-2]) / 25)
    except Exception as e:
        print(f"Error: {e}")
        return 0
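
As a quick sanity check of generate_url: it simply slots the address (spaces replaced with '+') and the search type into a fixed URL template. Run from the project root with an illustrative address:

from src.utils import generate_url

url = generate_url("carlton vic 3053", search_type="sold")
print(url)
# https://www.realestate.com.au/sold/in-carlton+vic+3053/list-1?includeSurrounding=false&activeSort=solddate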

src/webscraper_ai.py

import json
import httpx
from .all_config import (
    API_KEY, SITEMAP_ID
)

BASE_URL = "https://api.webscraper.io/api/v1/scraping-job"


def start_scraping(sitemap_id=SITEMAP_ID, url_=None):
    """Start a scraping job for a given sitemap ID."""
    url = f"{BASE_URL}?api_token={API_KEY}"  # Ensure the API token is included in the URL

    headers = {"Content-Type": "application/json",
               "Accept": "application/json"}
    payload = {
        "sitemap_id": sitemap_id,
        "driver": "fulljs",
        "page_load_delay": 2000,
        "request_interval": 1000,
        "proxy": 1,
        "start_urls": [
            url_
        ],
    }

    with httpx.Client() as client:
        # Use json=payload instead of data=
        response = client.post(url, headers=headers, json=payload)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error starting scraping: {response.status_code} - {response.text}")
        return {}

# start_scrape: {'success': True, 'data': {'id': 28731019, 'custom_id': None}}


def get_scraping_status_by_job(job_id):
    """Fetch the status of a scraping job."""
    url = f"{BASE_URL}/{job_id}?api_token={API_KEY}"  # Job status endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching status: {response.status_code} - {response.text}")
        return {}


# Scraping job status example:
# {'success': True, 'data': {'id': 28731019, 'custom_id': None, 'sitemap_id': 1274721, 'status': 'started',
#  'test_run': 0, 'sitemap_name': 'realestate_au',
#  'time_created': 1740759317, 'jobs_scheduled': 2,
#  'jobs_executed': 29, 'jobs_failed': 0,
#  'stored_record_count': 29, 'request_interval': 1000,
#  'page_load_delay': 2000, 'driver': 'fulljs',
#  'jobs_empty': 0, 'jobs_no_value': 0, 'scraping_duration': 207,
#  'scheduled': 0}}


def get_scraping_results(job_id):
    """Fetch the scraped data of a completed job."""
    url = f"{BASE_URL}/{job_id}/json?api_token={API_KEY}"  # Results endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        try:
            # Capture the raw response text
            data = response.text
            print("Raw Response Text:")
            # print(data)  # Print the raw response text

            # Process each line in the response as an individual JSON object
            results = []
            for line in data.splitlines():
                try:
                    json_data = json.loads(line)  # Parse each line as JSON
                    results.append(json_data)
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
                    continue

            print("Parsed JSON Data:")
            # Print the parsed JSON data in a readable format
            # print(json.dumps(results, indent=4))

            # Save the parsed data to a file
            with open("scraped_data.json", "w") as json_file:
                json.dump(results, json_file, indent=4)

            return results
        except Exception as e:
            print(f"Error processing response: {e}")
            return {}

    else:
        print(f"Error fetching results: {response.status_code}")
        return {}


"""
Method: POST
URL: https://api.webscraper.io/api/v1/scraping-job?api_token=<YOUR API TOKEN>
JSON:
{
    "sitemap_id": 123,
    "driver": "fast",  // "fast" or "fulljs"
    "page_load_delay": 2000,
    "request_interval": 2000,
    "proxy": 0,  // 0: no proxy, 1: use proxy, 123: custom proxy id, 'residential-*': use a residential proxy, replace * with a country code, e.g. 'residential-us'
    "start_urls": [  // optional; if set, will overwrite the sitemap start URLs
        "https://www.webscraper.io/test-sites/e-commerce/allinone/computers",
        "https://www.webscraper.io/test-sites/e-commerce/allinone/phones"
    ],
    "custom_id": "custom-scraping-job-12"  // optional, will be included in the webhook notification
}
"""

.dockerignore

.git
.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.gitignore

.mise.toml
.nvim.lua
storage
# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Added by Apify CLI
node_modules

requirements.txt

# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
parsel
httpx