
For Sale By Owner Scraper

Pricing: Pay per usage

FSBO Detail Scraper is an innovative tool designed for real estate investors targeting off-market, discounted properties. This scraper specializes in extracting detailed listings of "For Sale by Owner" (FSBO) properties from www.fsbo.com. It's a subscription-based service that provides comprehensive listing details, including addresses, asking prices, and seller contact information.

Rating: 0.0 (0)
Total users: 103
Monthly users: 24
Runs succeeded: 90%
Issues response: 88 days
Last modified: a year ago
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv
package.json
{ "name": "fsbo-scraper", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "dependencies": { "apify": "^3.1.10", "crawlee": "^3.7.1", "lodash": "^4.17.21", "puppeteer": "*", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-stealth": "^2.11.2" }, "devDependencies": { "@apify/eslint-config": "^0.4.0", "eslint": "^8.50.0" }, "scripts": { "start": "node src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent
.actor/actor.json
{ "actorSpecification": 1, "name": "fsbo-scraper", "title": "Crawl For Sale By Owner properties from fsbo.com", "description": "Crawlee and Puppeteer project in JavaScript.", "version": "0.0", "meta": { "templateId": "js-crawlee-puppeteer-chrome" }, "input": "./input_schema.json", "dockerfile": "./Dockerfile"}
.actor/input_schema.json
{ "title": "PlaywrightCrawler Template", "type": "object", "schemaVersion": 1, "properties": { "searchQueries": { "title": "Address (City/Zip)", "type": "array", "description": "Where would you like to find FSBO leads?", "editor": "stringList", "pattern": "string", "placeholderValue": "Tampa, FL", "prefill": [ "Tampa, FL" ] }, "maxPages": { "title": "Max pages per address search", "type": "integer", "description": "", "minimum": 1, "maximum": 5 }, "proxyConfiguration": { "title": "Proxy configuration", "type": "object", "description": "Select proxies to be used by your crawler.", "prefill": { "apifyProxyGroups": [ "RESIDENTIAL" ] }, "editor": "proxy" } }, "required": ["searchQueries"]}
src/main.js
import { Actor } from 'apify';
import { launchPuppeteer, sleep, CheerioCrawler, log } from 'crawlee';
import _ from 'lodash';

await Actor.init();

const results = {};

// Create an instance of the CheerioCrawler class - a crawler
// that automatically loads the URLs and parses their HTML using the cheerio library.
const cheerioCrawler = new CheerioCrawler({
    // The crawler downloads and processes the web pages in parallel, with a concurrency
    // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
    // Here we define some hard limits for the concurrency.
    minConcurrency: 10,
    maxConcurrency: 50,

    // On error, retry each page at most three times.
    maxRequestRetries: 3,

    // Increase the timeout for processing of each page.
    requestHandlerTimeoutSecs: 30,

    // Limit to 1000 requests per crawl.
    maxRequestsPerCrawl: 1000,

    // This function will be called for each URL to crawl.
    // It accepts a single parameter, an object with options described at:
    // https://crawlee.dev/api/cheerio-crawler/interface/CheerioCrawlerOptions#requestHandler
    // Only two of them are used here:
    // - request: an instance of the Request class with information such as the URL being crawled and the HTTP method
    // - $: the cheerio object containing the parsed HTML
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);

        // Store the results to the dataset. In local configuration,
        // the data will be stored as JSON files in ./storage/datasets/default.
        // The detail page contributes the seller name, phone number and price,
        // merged with the fields already scraped from the search results.
        await Actor.pushData({
            url: request.url,
            ...results[request.url],
            seller: $('.modal-body div').eq(2).children().eq(1).text().trim(),
            phone: $('.modal-body div').eq(2).children().eq(3).text().trim(),
            price: $('.price').text().trim(),
        });
    },

    // This function is called if the page processing failed more than maxRequestRetries + 1 times.
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed too many times.`);
    },
});

const input = await Actor.getInput();
const maxPages = input.maxPages || 1;
console.log(`maxPages ${maxPages}`);

if (!input.searchQueries?.length) {
    await Actor.fail('Input must contain at least one search query');
} else {
    // Launch the web browser.
    const browser = await launchPuppeteer({
        launchOptions: {
            headless: false,
            args: ['--no-sandbox'],
        },
    });
    let res = [];

    for (const searchQuery of input.searchQueries) {
        console.log(searchQuery);

        // Create and navigate a new page.
        console.log('Open target page');
        const page = await browser.newPage();
        await page.goto('https://fsbo.com/listings/search/results/');

        // Fill in the search form, widen the search radius, submit,
        // and wait for the results page to load.
        await page.type('.search-query', searchQuery);
        await page.evaluate(() => {
            document.querySelector('#RefineSearchDistance').value = '100';
            document.querySelector('form .btn-fsbo-primary').click();
        });
        await sleep(3000);

        let hasNextPage = true;
        let pagesScraped = 0;

        // Paginate through the search results, up to maxPages pages per query.
        while (hasNextPage && pagesScraped < maxPages) {
            console.log('next page');
            pagesScraped += 1;
            const currentPageResults = await page.evaluate(() => {
                const result = [];
                for (const listing of document.querySelectorAll('.listing-item')) {
                    const askingPrice = listing.querySelector('.listing-right h4').textContent;
                    const address1 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[1];
                    const address2 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[2];

                    const zipcode = address2.slice(-5);
                    const city = address2.replace(zipcode, '').trim();
                    result.push({
                        url: listing.querySelector('a').href,
                        askingPrice,
                        address1,
                        address2,
                        city,
                        zipcode,
                        title: listing.querySelector('h4').textContent,
                    });
                }

                return result;
            });

            hasNextPage = await page.evaluate(() => {
                return !!document.querySelector('.nextPage');
            });
            console.log(`Current page: ${currentPageResults.length} | ${currentPageResults[0]?.url}`);
            res = [...res, ...currentPageResults];
            if (hasNextPage && pagesScraped < maxPages) {
                console.log('clicking nextpage');
                await page.click('.nextPage');
                await sleep(5000);
            }
        }

        await page.close();
    }

    // Deduplicate the collected listings, hand their detail URLs to the
    // CheerioCrawler, and only then shut the browser down.
    res.forEach((r) => {
        results[r.url] = { ...r };
    });
    console.log(`All: ${res.length}`);
    const allPages = _.uniq(res.map((r) => r.url));
    console.log(allPages);
    console.log(`Unique pages: ${allPages.length}`);
    await cheerioCrawler.run(allPages);

    // Close browser.
    await browser.close();

    await Actor.exit();
}
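A dataset item produced by this flow combines the search-result fields with the detail-page fields. The record below is a hypothetical example: the field names come from the code above, while every value is made up.

{
    "url": "https://fsbo.com/listings/listing/12345/",
    "askingPrice": "$250,000",
    "address1": "123 Example St",
    "address2": "Tampa, FL 33604",
    "city": "Tampa, FL",
    "zipcode": "33604",
    "title": "3 Bed / 2 Bath Single-Family Home",
    "seller": "Jane Doe",
    "phone": "(813) 555-0100",
    "price": "$250,000"
}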
src/routes.js
import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://www.zillow.com/ybor-city-tampa-fl/?searchQueryState=%7B%22isMapVisible%22%3Atrue%2C%22mapBounds%22%3A%7B%22north%22%3A27.98882307307199%2C%22south%22%3A27.957592306053694%2C%22east%22%3A-82.42739039666748%2C%22west%22%3A-82.46936160333252%7D%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A15%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A126709%2C%22regionType%22%3A8%7D%5D%2C%22pagination%22%3A%7B%7D%7D'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    await page.type('input', 'tampa, fl');
    await page.click('.btn-fsbo-primary');
    await page.waitForNavigation();
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    const num = await page.evaluate(() => {
        return document.querySelector('.detail-box-phone a').textContent;
    });
    // The listing headings must be read in the browser context; `document`
    // does not exist in the Node.js context this handler runs in.
    const listings = await page.$$eval('.listing-item div h4', (els) => els.map((el) => el.textContent.trim()));
    for (const listing of listings) {
        await Dataset.pushData({
            url: listing,
            // title,
            // num,
        });
    }
});
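Note that this router is exported but never imported by src/main.js, so it currently has no effect on the actor. If it were used, the usual Crawlee pattern is to pass it as the requestHandler of a PuppeteerCrawler. A minimal sketch, assuming the fsbo.com search page as the start URL:

import { PuppeteerCrawler } from 'crawlee';
import { router } from './routes.js';

// Route every crawled page through the handlers registered on the router:
// the default handler enqueues links, and the 'detail' handler processes
// pages that were enqueued with label: 'detail'.
const crawler = new PuppeteerCrawler({
    requestHandler: router,
});

await crawler.run(['https://fsbo.com/listings/search/results/']);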
src/service.js
(empty)