Buy Realestate Au
Deprecated
Rating: 0.0 (0)
Pricing: Pay per usage
Monthly users: 0
Last modified: 2 days ago
.actor/Dockerfile
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
FROM apify/actor-python:3.13

# Second, copy just requirements.txt into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build.
COPY requirements.txt ./

# Install the packages specified in requirements.txt,
# then print the installed Python version, pip version
# and all installed packages with their versions for debugging.
RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing dependencies:" \
 && pip install -r requirements.txt \
 && echo "All installed Python packages:" \
 && pip freeze

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python3 -m compileall -q .

# Create and run as a non-root user.
RUN useradd --create-home apify && \
    chown -R apify:apify ./
USER apify

# Specify how to launch the source code of your Actor.
# By default, the "python3 -m src" command is run.
CMD ["python3", "-m", "src"]
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "buy-realestate-au",
    "title": "buy-realestate-au Scraper with Python",
    "description": "Scrape Items from buy-realestate-au",
    "version": "0.0",
    "buildTag": "latest",
    "meta": {
        "templateId": "python-start"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "Buy Realestate_au Scraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "address": {
            "title": "Put your address here",
            "type": "string",
            "description": "Address to scrape",
            "editor": "textfield"
        }
    },
    "required": ["address"]
}
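For illustration, a run input that satisfies this schema could look like the sketch below; the suburb value is purely a hypothetical placeholder. src/main.py reads the address field from this object via Actor.get_input().

# Hypothetical run input matching the schema above (the address value is a placeholder).
example_input = {
    "address": "Parramatta NSW 2150"
}

# In src/main.py this arrives as:
#   actor_input = await Actor.get_input() or {}
#   address = actor_input.get('address')  # -> "Parramatta NSW 2150"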
src/__init__.py
src/__main__.py
import asyncio

from .main import main

# Execute the Actor entry point.
asyncio.run(main())
src/all_config.py
# Webscraper.io API token used to authenticate requests.
API_KEY = '7uYsiEQtr721LUrV7jghPBbgrzDC1uRDgTVYQHS5m9feJFfyv4RxZT70Hq2o'
# ID of the Webscraper.io sitemap whose selectors target realestate.com.au.
SITEMAP_ID = 1298618
# Listing type to scrape: 'buy' or 'sold'.
SEARCH_TYPE = 'buy'
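A note on the constants above: the Webscraper.io token is committed in plain text. As a hedged alternative sketch (not part of the original Actor), the same values could be read from hypothetical environment variables, falling back to the current defaults:

import os

# Sketch only: WEBSCRAPER_API_KEY and WEBSCRAPER_SITEMAP_ID are hypothetical variable names.
API_KEY = os.environ.get('WEBSCRAPER_API_KEY', '7uYsiEQtr721LUrV7jghPBbgrzDC1uRDgTVYQHS5m9feJFfyv4RxZT70Hq2o')
SITEMAP_ID = int(os.environ.get('WEBSCRAPER_SITEMAP_ID', '1298618'))
SEARCH_TYPE = os.environ.get('SEARCH_TYPE', 'buy')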
src/main.py
import asyncio

from apify import Actor

from .utils import (
    extract_data_lists,
    generate_url
)
from .webscraper_ai import (
    start_scraping,
    get_scraping_status_by_job,
    get_scraping_results
)
from .all_config import SEARCH_TYPE


async def main() -> None:
    async with Actor:
        # Retrieve the input object for the Actor. The structure of the input is defined in input_schema.json.
        actor_input = await Actor.get_input() or {}
        address = actor_input.get('address')
        gen_url = generate_url(address, search_type=SEARCH_TYPE)

        # Example response: {'success': True, 'data': {'id': 28731019, 'custom_id': None}}
        start_scraping_st = start_scraping(url_=gen_url)
        if start_scraping_st.get('success'):
            Actor.log.info('Started scraping process on webscraper_io')
        else:
            Actor.log.error('Failed to start scraping process on webscraper_io')
            return

        job_id = start_scraping_st.get('data', {}).get('id')
        if not job_id:
            Actor.log.error('Failed to retrieve job ID')
            return

        Actor.log.info(f'Started successfully with job ID: {job_id} | webscraper_io')

        # Poll the status of the scraping job until it is finished or stopped.
        while True:
            job_status = get_scraping_status_by_job(job_id)
            info_status = job_status.get('data', {}).get('status')
            Actor.log.info(f'Job status: {info_status}')
            if info_status == 'finished':
                Actor.log.info('Scraping job has finished')
                break
            if info_status == 'stopped':
                Actor.log.error('Scraping job has been stopped')
                break
            # Use asyncio.sleep so the event loop is not blocked while waiting.
            await asyncio.sleep(10)

        # Retrieve the scraping results from the job.
        scraping_results = get_scraping_results(job_id)
        if not scraping_results:
            Actor.log.error('Failed to retrieve scraping results')
            return

        Actor.log.info('Found scraping results')

        # Extract the data from the scraping results and push each item to the dataset.
        Actor.log.info('Extracting data from the scraping results')
        data_lists = extract_data_lists(scraping_results, SEARCH_TYPE)
        Actor.log.info(f'Total items: {len(data_lists)}')
        for item in data_lists:
            Actor.log.info('---------------------------------------------')
            Actor.log.info(f'Extracted data: {item}')
            Actor.log.info('---------------------------------------------')
            await Actor.push_data(item)
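For reference, each record pushed to the dataset by the loop above follows the shape built in extract_data_lists (src/utils.py). All values below are hypothetical placeholders:

# Hypothetical dataset item for search_type == 'buy'; field names come from src/utils.py.
example_item = {
    'address_heading': '12 Example Street, Parramatta, NSW 2150',
    'price': '$850,000 - $900,000',
    'added': 'Added 3 days ago',
    'inspection': 'Sat 10:00am',
    'property_img': 'https://example.com/property.jpg',
    'property_type': 'House',
    'property_details': '3 bedrooms, 2 bathrooms, 1 car space',
    'branding_title': 'Example Realty Parramatta',
    'agent_name': 'Jane Citizen',
    'agent_img': 'https://example.com/agent.jpg',
    'search_type': 'buy',
}
# For search_type == 'sold', 'added' and 'inspection' are replaced by 'sold_status' and 'sold_on'.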
src/py.typed
src/utils.py
from parsel.selector import Selector


def extract_data_lists(driver_source, search_type="buy"):
    """Parse the scraped HTML and return a list of property dicts."""
    items = []

    try:
        response = Selector(text=str(driver_source))
        properties = response.xpath('//*[contains(@class, "PropertyCardLayout")]')
        for prop in properties:
            branding_title = ''
            agent_name = ''
            agent_img = ''
            property_img = ''
            address_heading = ''
            added = ''
            price = ''
            inspection = ''
            property_type = ''
            property_details = ''
            sold_status = ''
            sold_on = ''

            item = {}

            # ----------------------------------------------------------------
            prop_content_details = prop.xpath(
                './/*[contains(@class, "residential-card__content-wrapper")]')
            prop_content_upper = prop.xpath(
                './/*[contains(@class, "residential-card__banner-strip")]')

            # ----------------------------------------------------------------
            address_heading = prop_content_details.xpath(
                './/*[contains(@class, "residential-card__address-heading")]//text()').extract_first()
            residential_card_primary = prop_content_details.xpath(
                './/*[contains(@class, "residential-card__primary")]')

            price = prop_content_details.xpath(
                './/*[contains(@class, "property-price")]//text()').extract_first()
            property_type = residential_card_primary.xpath(
                './p//text()').extract_first()
            property_details = residential_card_primary.xpath(
                './/li//@aria-label').extract()
            property_details = ", ".join(property_details)

            added = prop_content_upper.xpath(
                './/*[contains(@class, "styles__Content")]//text()').extract()
            added = [ad for ad in added if "Added" in str(ad)]
            added = "".join(added)

            inspection = prop_content_details.xpath(
                './/*[contains(@class, "inspection__long-label")]//text()').extract_first()
            branding_title = prop.xpath(
                './/*[contains(@class, "branding__image")]//@alt').extract_first()
            agent_name = prop.xpath(
                './/*[contains(@class, "agent__name")]//text()').extract_first()
            agent_img = prop.xpath(
                './/*[contains(@class, "agent__name")]//following-sibling::img//@src').extract_first()

            property_img = prop.xpath(
                './/*[@class="property-image"]//@data-url').extract_first()

            sold_status = prop_content_upper.xpath(
                './/span[contains(@class, "styles__Content")]//text()').extract_first()
            sold_on = prop_content_details.xpath(
                './/span[contains(text(), "Sold on")]/text()').extract_first()

            # ----------------------------------------------------------------
            item['address_heading'] = address_heading
            item['price'] = price
            if search_type == "buy":
                item['added'] = added
                item['inspection'] = inspection
            if search_type == "sold":
                item['sold_status'] = sold_status
                item['sold_on'] = sold_on
            item['property_img'] = property_img
            item['property_type'] = property_type
            item['property_details'] = property_details
            item['branding_title'] = branding_title
            item['agent_name'] = agent_name
            item['agent_img'] = agent_img
            item['search_type'] = search_type

            items.append(item)

    except Exception as e:
        print(f"Error: {e}")

    return items


def generate_url(input_address, search_type="buy", page=1):
    """Build a realestate.com.au listing URL for the given address and search type."""
    correct_address = input_address.replace(" ", "+")
    url = f"https://www.realestate.com.au/{search_type}/in-{correct_address}/list-{page}"
    return url


def properties_pages_count(driver_source):
    """Return the number of result pages, assuming 25 listings per page."""
    try:
        response_ = Selector(text=str(driver_source))
        results_count = response_.xpath(
            '//*[contains(@class, "StyledResultsCount")]//text()').extract_first()
        return int(int(results_count.split()[-2]) / 25)
    except Exception as e:
        print(f"Error: {e}")
        return 0
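To make the URL construction concrete, a hypothetical address produces the following (the suburb and postcode are example values; the pattern comes directly from generate_url):

from src.utils import generate_url

url = generate_url("Parramatta NSW 2150", search_type="buy", page=1)
print(url)  # https://www.realestate.com.au/buy/in-Parramatta+NSW+2150/list-1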
src/webscraper_ai.py
import json

import httpx

from .all_config import (
    API_KEY, SITEMAP_ID
)

BASE_URL = "https://api.webscraper.io/api/v1/scraping-job"


def start_scraping(sitemap_id=SITEMAP_ID, url_=None):
    """Start a scraping job for a given sitemap ID."""
    url = f"{BASE_URL}?api_token={API_KEY}"  # Ensure the API token is included in the URL

    headers = {"Content-Type": "application/json",
               "Accept": "application/json"}
    payload = {
        "sitemap_id": sitemap_id,
        "driver": "fulljs",
        "page_load_delay": 2000,
        "request_interval": 1000,
        "proxy": 1,
        "start_urls": [
            url_
        ],
    }

    with httpx.Client() as client:
        # Use json=payload instead of data= so the body is serialized as JSON.
        response = client.post(url, headers=headers, json=payload)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error starting scraping: {response.status_code} - {response.text}")
        return {}

# Example response: {'success': True, 'data': {'id': 28731019, 'custom_id': None}}


def get_scraping_status_by_job(job_id):
    """Fetch the status of a scraping job."""
    url = f"{BASE_URL}/{job_id}?api_token={API_KEY}"  # Job status endpoint

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching status: {response.status_code} - {response.text}")
        return {}


# Example scraping job status response:
# {'success': True, 'data': {'id': 28731019, 'custom_id': None, 'sitemap_id': 1274721, 'status': 'started',
#  'test_run': 0, 'sitemap_name': 'realestate_au',
#  'time_created': 1740759317, 'jobs_scheduled': 2,
#  'jobs_executed': 29, 'jobs_failed': 0,
#  'stored_record_count': 29, 'request_interval': 1000,
#  'page_load_delay': 2000, 'driver': 'fulljs',
#  'jobs_empty': 0, 'jobs_no_value': 0, 'scraping_duration': 207,
#  'scheduled': 0}}


def get_scraping_results(job_id):
    """Fetch the scraped data of a completed job."""
    url = f"{BASE_URL}/{job_id}/json?api_token={API_KEY}"  # Results endpoint (JSON Lines format)

    with httpx.Client() as client:
        response = client.get(url)

    if response.status_code == 200:
        try:
            # The endpoint returns one JSON object per line (JSON Lines).
            data = response.text

            # Process each line in the response as an individual JSON object.
            results = []
            for line in data.splitlines():
                try:
                    json_data = json.loads(line)  # Parse each line as JSON
                    results.append(json_data)
                except json.JSONDecodeError:
                    print(f"Error decoding line: {line}")
                    continue

            # Save the parsed data to a file for debugging.
            with open("scraped_data.json", "w") as json_file:
                json.dump(results, json_file, indent=4)

            return results
        except Exception as e:
            print(f"Error processing response: {e}")
            return {}

    else:
        print(f"Error fetching results: {response.status_code}")
        return {}


"""
Webscraper.io API reference for starting a scraping job:

Method: POST
URL: https://api.webscraper.io/api/v1/scraping-job?api_token=<YOUR API TOKEN>
JSON:
{
    "sitemap_id": 123,
    "driver": "fast",  // "fast" or "fulljs"
    "page_load_delay": 2000,
    "request_interval": 2000,
    "proxy": 0,  // 0: no proxy, 1: use proxy, 123: custom proxy id, 'residential-*': residential proxy, replace * with a country code, e.g. 'residential-us'
    "start_urls": [  // optional, if set, will overwrite sitemap start URLs
        "https://www.webscraper.io/test-sites/e-commerce/allinone/computers",
        "https://www.webscraper.io/test-sites/e-commerce/allinone/phones"
    ],
    "custom_id": "custom-scraping-job-12"  // optional, will be included in webhook notification
}
"""
.dockerignore
.git
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.gitignore
.mise.toml
.nvim.lua
storage

# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Added by Apify CLI
node_modules
requirements.txt
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify < 3.0
parsel
httpx
Pricing
Pricing model: Pay per usage
This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.