For Sale By Owner Scraper
apifyscraper/fsbo-scraper
FSBO Detail Scraper is an innovative tool designed for real estate investors targeting off-market, discounted properties. This scraper specializes in extracting detailed listings of "For Sale by Owner" (FSBO) properties from www.fsbo.com. It's a subscription-based service that provides comprehensive
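The actor takes the input described by .actor/input_schema.json below (searchQueries, maxPages, proxyConfiguration) and writes its results to the run's default dataset. The snippet below is a minimal sketch of calling it through the Apify API client from JavaScript; it assumes the actor ID apifyscraper/fsbo-scraper shown above and an API token in the APIFY_TOKEN environment variable, and is not part of the actor's source.

import { ApifyClient } from 'apify-client';

// Sketch only: an Apify API token is assumed to be set in APIFY_TOKEN.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Run the scraper with input matching the schema further below.
const run = await client.actor('apifyscraper/fsbo-scraper').call({
    searchQueries: ['Tampa, FL'],
    maxPages: 3,
    proxyConfiguration: {
        useApifyProxy: true,
        apifyProxyGroups: ['RESIDENTIAL'],
    },
});

// Fetch the scraped FSBO listings from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(`Scraped ${items.length} FSBO listings`);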
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{
    "extends": "@apify",
    "root": true
}
.gitignore
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
package.json
{
    "name": "fsbo-scraper",
    "version": "0.0.1",
    "type": "module",
    "description": "Apify actor that scrapes For Sale By Owner (FSBO) listings from fsbo.com.",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.7.1",
        "lodash": "^4.17.21",
        "puppeteer": "*",
        "puppeteer-extra": "^3.3.6",
        "puppeteer-extra-plugin-stealth": "^2.11.2"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{
    "actorSpecification": 1,
    "name": "fsbo-scraper",
    "title": "Crawl For Sale By Owner properties from fsbo.com",
    "description": "Crawlee and Puppeteer project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}
.actor/input_schema.json
{
    "title": "For Sale By Owner Scraper input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "searchQueries": {
            "title": "Address (City/Zip)",
            "type": "array",
            "description": "Where would you like to find FSBO leads?",
            "editor": "stringList",
            "placeholderValue": "Tampa, FL",
            "prefill": [
                "Tampa, FL"
            ]
        },
        "maxPages": {
            "title": "Max pages per address search",
            "type": "integer",
            "description": "Maximum number of search result pages to scrape for each address (1-5).",
            "minimum": 1,
            "maximum": 5
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "prefill": {
                "apifyProxyGroups": [
                    "RESIDENTIAL"
                ]
            },
            "editor": "proxy"
        }
    },
    "required": ["searchQueries"]
}
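An input that satisfies this schema looks roughly like the following. The values are illustrative placeholders, and "useApifyProxy" is the flag the standard Apify proxy editor adds to the proxyConfiguration object rather than a field defined in this schema:

{
    "searchQueries": ["Tampa, FL", "33602"],
    "maxPages": 3,
    "proxyConfiguration": {
        "useApifyProxy": true,
        "apifyProxyGroups": ["RESIDENTIAL"]
    }
}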
src/main.js
import { Actor } from 'apify';
import { launchPuppeteer, sleep, CheerioCrawler, log } from 'crawlee';
import _ from 'lodash';

await Actor.init();

const results = {};

// Create an instance of the CheerioCrawler class - a crawler
// that automatically loads the URLs and parses their HTML using the cheerio library.
const cheerioCrawler = new CheerioCrawler({
    // The crawler downloads and processes the web pages in parallel, with a concurrency
    // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
    // Here we define some hard limits for the concurrency.
    minConcurrency: 10,
    maxConcurrency: 50,

    // On error, retry each page at most three times.
    maxRequestRetries: 3,

    // Increase the timeout for processing of each page.
    requestHandlerTimeoutSecs: 30,

    // Limit to 1000 requests per one crawl.
    maxRequestsPerCrawl: 1000,

    // This function will be called for each URL to crawl.
    // It accepts a single parameter, which is an object with options as:
    // https://crawlee.dev/api/cheerio-crawler/interface/CheerioCrawlerOptions#requestHandler
    // We use only 2 of them here:
    // - request: an instance of the Request class with information such as the URL that is being crawled and HTTP method
    // - $: the cheerio object containing parsed HTML
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);

        // Store the results to the dataset. In local configuration,
        // the data will be stored as JSON files in ./storage/datasets/default
        await Actor.pushData({
            url: request.url,
            // Fields collected from the search results page for this URL.
            ...results[request.url],
            // Seller contact details scraped from the listing detail page.
            seller: $('.modal-body div').eq(2).children().eq(1).text().trim(),
            phone: $('.modal-body div').eq(2).children().eq(3).text().trim(),
            price: $('.price').text().trim(),
        });
    },

    // This function is called when a request has failed more than maxRequestRetries times.
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed too many times.`);
    },
});

const input = await Actor.getInput();
const maxPages = input.maxPages || 1;
console.log(`maxPages ${maxPages}`);

if (!input.searchQueries?.length) {
    await Actor.fail('Input must contain at least one search query');
} else {
    // Launch the web browser.
    const browser = await launchPuppeteer({
        launchOptions: {
            headless: false,
            args: ['--no-sandbox'],
        },
    });
    let res = [];

    for (const searchQuery of input.searchQueries) {
        console.log(searchQuery);

        // Create and navigate a new page.
        console.log('Open target page');
        const page = await browser.newPage();
        await page.goto('https://fsbo.com/listings/search/results/');

        // Fill in the search address, widen the search radius and submit the form.
        await page.type('.search-query', searchQuery);
        await page.evaluate(() => {
            document.querySelector('#RefineSearchDistance').value = '100';
            document.querySelector('form .btn-fsbo-primary').click();
        });
        await sleep(3000);

        let hasNextPage = true;
        let pageCount = 0;

        // Paginate through the search results, up to maxPages pages per query.
        while (hasNextPage && pageCount < maxPages) {
            console.log('next page');
            pageCount++;
            const currentPageResults = await page.evaluate(() => {
                const result = [];
                for (const listing of document.querySelectorAll('.listing-item')) {
                    const askingPrice = listing.querySelector('.listing-right h4').textContent;
                    const address1 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[1];
                    const address2 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[2];

                    const zipcode = address2.slice(-5);
                    const city = address2.replace(zipcode, '').trim();
                    result.push({
                        url: listing.querySelector('a').href,
                        askingPrice,
                        address1,
                        address2,
                        city,
                        zipcode,
                        title: listing.querySelector('h4').textContent,
                    });
                }

                return result;
            });

            hasNextPage = await page.evaluate(() => {
                return !!document.querySelector('.nextPage');
            });
            console.log(`Current page: ${currentPageResults.length} | ${currentPageResults[0]?.url}`);
            res = [...res, ...currentPageResults];

            if (hasNextPage && pageCount < maxPages) {
                console.log('clicking nextpage');
                await page.click('.nextPage');
                await sleep(5000);
            }
        }

        await page.close();
    }

    // Index the search results by URL so the CheerioCrawler can merge them
    // with the details scraped from each listing page.
    res.forEach((r) => {
        results[r.url] = { ...r };
    });
    console.log(`All: ${res.length}`);
    const allPages = _.uniq(res.map((r) => r.url));
    console.log(allPages);
    console.log(`Unique Pages: ${allPages.length}`);

    // Crawl the detail pages and store the combined data in the default dataset.
    await cheerioCrawler.run(allPages);

    // Close browser
    await browser.close();

    await Actor.exit();
}
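Each URL passed to the CheerioCrawler above produces one dataset record that merges the search-result fields with the contact details scraped from the listing page. A record therefore has roughly the following shape; all values below are placeholders, not data from a real run:

{
    "url": "https://fsbo.com/listings/...",
    "askingPrice": "$299,000",
    "address1": "123 Example St",
    "address2": "Tampa, FL 33602",
    "city": "Tampa, FL",
    "zipcode": "33602",
    "title": "Example listing title",
    "seller": "Jane Doe",
    "phone": "(813) 555-0100",
    "price": "$299,000"
}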
src/routes.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://www.zillow.com/ybor-city-tampa-fl/?searchQueryState=%7B%22isMapVisible%22%3Atrue%2C%22mapBounds%22%3A%7B%22north%22%3A27.98882307307199%2C%22south%22%3A27.957592306053694%2C%22east%22%3A-82.42739039666748%2C%22west%22%3A-82.46936160333252%7D%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A15%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A126709%2C%22regionType%22%3A8%7D%5D%2C%22pagination%22%3A%7B%7D%7D'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    await page.type('input', 'tampa, fl');
    await page.click('.btn-fsbo-primary');
    await page.waitForNavigation();
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    const num = await page.evaluate(() => {
        return document.querySelector('.detail-box-phone a').text;
    });
    // Collect the listing headings inside the browser context -
    // `document` is not available in the Node.js scope of this handler.
    const listings = await page.evaluate(() => {
        return [...document.querySelectorAll('.listing-item div h4')].map((el) => el.textContent.trim());
    });
    for (const listing of listings) {
        await Dataset.pushData({
            url: listing,
            // title,
            // num,
        });
    }
});
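Note that src/routes.js is never imported by src/main.js, so this router is currently dead code (and its default handler still enqueues a leftover Zillow URL). If it were wired up, the usual Crawlee pattern would be to pass it to a PuppeteerCrawler as its request handler; the snippet below is only a sketch of that pattern, not part of the actor:

import { PuppeteerCrawler } from 'crawlee';
import { router } from './routes.js';

const crawler = new PuppeteerCrawler({
    // Requests are dispatched to the handlers registered on the router,
    // according to the label set when they were enqueued ('detail' above).
    requestHandler: router,
});

// Start from the FSBO search results page used in src/main.js.
await crawler.run(['https://fsbo.com/listings/search/results/']);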
src/service.js