For Sale By Owner Scraper

Developed by Kobi

Maintained by Community

FSBO Detail Scraper is a tool designed for real estate investors targeting off-market, discounted properties. It specializes in extracting detailed "For Sale by Owner" (FSBO) listings from www.fsbo.com. It is a subscription-based service that provides comprehensive property details, including the address, asking price, and the seller's name and phone number.
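For example, once published on Apify, the actor could be called from JavaScript with the apify-client package. This is a minimal sketch; the actor ID kobi/fsbo-scraper and the token placeholder are assumptions, not taken from this page:

import { ApifyClient } from 'apify-client';

// Hypothetical actor ID and token; replace with your own values.
const client = new ApifyClient({ token: '<YOUR_APIFY_TOKEN>' });
const run = await client.actor('kobi/fsbo-scraper').call({
    searchQueries: ['Tampa, FL'],
    maxPages: 2,
});

// Fetch the scraped listings from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);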

Rating: 0.0 (0 reviews)

Pricing: Pay per usage

Total users: 103

Monthly users: 24

Runs succeeded: 90%

Issues response: 88 days

Last modified: a year ago

.dockerignore

# configurations
.idea
# crawlee and apify storage folders
apify_storage
crawlee_storage
storage
# installed files
node_modules
# git folder
.git

.editorconfig

root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
# Added by Apify CLI
.venv

package.json

{
    "name": "fsbo-scraper",
    "version": "0.0.1",
    "type": "module",
    "description": "Apify actor that scrapes For Sale By Owner listings from fsbo.com.",
    "dependencies": {
        "apify": "^3.1.10",
        "crawlee": "^3.7.1",
        "lodash": "^4.17.21",
        "puppeteer": "*",
        "puppeteer-extra": "^3.3.6",
        "puppeteer-extra-plugin-stealth": "^2.11.2"
    },
    "devDependencies": {
        "@apify/eslint-config": "^0.4.0",
        "eslint": "^8.50.0"
    },
    "scripts": {
        "start": "node src/main.js",
        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
    },
    "author": "It's not you it's me",
    "license": "ISC"
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18
# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm
# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./
# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "fsbo-scraper",
    "title": "Crawl For Sale By Owner properties from fsbo.com",
    "description": "Crawlee and Puppeteer project in JavaScript.",
    "version": "0.0",
    "meta": {
        "templateId": "js-crawlee-puppeteer-chrome"
    },
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "FSBO Scraper Input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "searchQueries": {
            "title": "Address (City/Zip)",
            "type": "array",
            "description": "Where would you like to find FSBO leads?",
            "editor": "stringList",
            "placeholderValue": "Tampa, FL",
            "prefill": [
                "Tampa, FL"
            ]
        },
        "maxPages": {
            "title": "Max pages per address search",
            "type": "integer",
            "description": "Maximum number of result pages to crawl for each address search.",
            "minimum": 1,
            "maximum": 5
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "prefill": {
                "apifyProxyGroups": [
                    "RESIDENTIAL"
                ]
            },
            "editor": "proxy"
        }
    },
    "required": ["searchQueries"]
}
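For reference, here is a minimal sketch of an input payload that this schema accepts; the values are illustrative only:

{
    "searchQueries": ["Tampa, FL", "33601"],
    "maxPages": 2,
    "proxyConfiguration": {
        "useApifyProxy": true,
        "apifyProxyGroups": ["RESIDENTIAL"]
    }
}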

src/main.js

import { Actor } from 'apify';
import { launchPuppeteer, sleep, CheerioCrawler, log } from 'crawlee';
import _ from 'lodash';

await Actor.init();

const results = {};

// Create an instance of the CheerioCrawler class - a crawler
// that automatically loads the URLs and parses their HTML using the cheerio library.
const cheerioCrawler = new CheerioCrawler({
    // The crawler downloads and processes the web pages in parallel, with a concurrency
    // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
    // Here we define some hard limits for the concurrency.
    minConcurrency: 10,
    maxConcurrency: 50,

    // On error, retry each page at most three times.
    maxRequestRetries: 3,

    // Increase the timeout for processing of each page.
    requestHandlerTimeoutSecs: 30,

    // Limit to 1000 requests per one crawl.
    maxRequestsPerCrawl: 1000,

    // This function will be called for each URL to crawl.
    // It accepts a single parameter, which is an object with options as:
    // https://crawlee.dev/api/cheerio-crawler/interface/CheerioCrawlerOptions#requestHandler
    // We use for demonstration only 2 of them:
    // - request: an instance of the Request class with information such as the URL that is being crawled and HTTP method
    // - $: the cheerio object containing parsed HTML
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);

        // Store the results to the dataset. In local configuration,
        // the data will be stored as JSON files in ./storage/datasets/default
        await Actor.pushData({
            url: request.url,
            ...results[request.url],
            seller: $('.modal-body div').eq(2).children().eq(1).text().trim(),
            phone: $('.modal-body div').eq(2).children().eq(3).text().trim(),
            price: $('.price').text().trim(),
        });
    },

    // This function is called if the page processing failed more than maxRequestRetries + 1 times.
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed too many times.`);
    },
});

const input = await Actor.getInput() ?? {};
const maxPages = input.maxPages || 1;
console.log(`maxPages ${maxPages}`);
if (!input.searchQueries?.length) {
    await Actor.fail('Input must contain at least one search query');
} else {
    // Launch the web browser.
    const browser = await launchPuppeteer({
        launchOptions: {
            headless: false,
            args: ['--no-sandbox'],
        },
    });
    let res = [];

    for (const searchQuery of input.searchQueries) {
        console.log(searchQuery);

        // Create and navigate new page
        console.log('Open target page');
        const page = await browser.newPage();
        await page.goto('https://fsbo.com/listings/search/results/');

        // Fill in the search query, widen the search radius and submit the form.
        await page.type('.search-query', searchQuery);
        await page.evaluate(() => {
            document.querySelector('#RefineSearchDistance').value = '100';
            document.querySelector('form .btn-fsbo-primary').click();
        });
        await sleep(3000);

        let hasNextPage = true;
        let pageCount = 0;

        // Paginate through the result pages, up to maxPages per search query.
        while (hasNextPage && pageCount < maxPages) {
            pageCount += 1;
            const currentPageResults = await page.evaluate(() => {
                const result = [];
                for (const listing of document.querySelectorAll('.listing-item')) {
                    const askingPrice = listing.querySelector('.listing-right h4').textContent;
                    const address1 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[1];
                    const address2 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[2];

                    const zipcode = address2.slice(-5);
                    const city = address2.replace(zipcode, '').trim();
                    result.push({
                        url: listing.querySelector('a').href,
                        askingPrice,
                        address1,
                        address2,
                        city,
                        zipcode,
                        title: listing.querySelector('h4').textContent,
                    });
                }

                return result;
            });

            hasNextPage = await page.evaluate(() => {
                return !!document.querySelector('.nextPage');
            });
            console.log(`Current page: ${currentPageResults.length} listings | ${currentPageResults[0]?.url}`);
            res = [...res, ...currentPageResults];
            if (hasNextPage) {
                console.log('clicking nextPage');
                await page.click('.nextPage');
                await sleep(5000);
            }
        }

        await page.close();
    }

    res.forEach((r) => {
        results[r.url] = { ...r };
    });
    console.log(`All: ${res.length}`);
    const allPages = _.uniq(res.map((r) => r.url));
    console.log(allPages);
    console.log(`Unique pages: ${allPages.length}`);

    // Crawl the detail pages with the CheerioCrawler; each dataset item merges
    // the listing data collected above with the seller details from the detail page.
    await cheerioCrawler.run(allPages);

    // Close browser
    await browser.close();
}
await Actor.exit();
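Based on the fields pushed in requestHandler (the listing data collected from the search results, merged with seller, phone and price from the detail page), a dataset item should roughly have the following shape. All values below are illustrative, including the URL pattern:

{
    "url": "https://fsbo.com/listings/listing/show/id/123456/",
    "askingPrice": "$250,000",
    "address1": "123 Example St",
    "address2": "Tampa, FL 33601",
    "city": "Tampa, FL",
    "zipcode": "33601",
    "title": "3 Bed, 2 Bath Single-Family Home",
    "seller": "Jane Doe",
    "phone": "(813) 555-0100",
    "price": "$250,000"
}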

src/routes.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://www.zillow.com/ybor-city-tampa-fl/?searchQueryState=%7B%22isMapVisible%22%3Atrue%2C%22mapBounds%22%3A%7B%22north%22%3A27.98882307307199%2C%22south%22%3A27.957592306053694%2C%22east%22%3A-82.42739039666748%2C%22west%22%3A-82.46936160333252%7D%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A15%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A126709%2C%22regionType%22%3A8%7D%5D%2C%22pagination%22%3A%7B%7D%7D'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    await page.type('input', 'tampa, fl');
    await page.click('.btn-fsbo-primary');
    await page.waitForNavigation();
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    const num = await page.evaluate(() => {
        return document.querySelector('.detail-box-phone a').textContent;
    });
    // Read the listing headings in the browser context; the original code
    // accessed `document` directly from Node.js, where it does not exist.
    const listings = await page.$$eval('.listing-item div h4', (els) => els.map((el) => el.textContent));
    for (const listing of listings) {
        await Dataset.pushData({
            title: listing,
            // num,
        });
    }
});
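Note that main.js never imports this router, so the file appears to be a leftover from the Crawlee template. If it were wired up, the usual Crawlee pattern is to pass the router as the crawler's requestHandler. A minimal sketch, with the start URL assumed:

import { PuppeteerCrawler } from 'crawlee';
import { router } from './routes.js';

// The router dispatches each request by its label;
// requests without a label go to the default handler.
const crawler = new PuppeteerCrawler({
    requestHandler: router,
});
await crawler.run(['https://fsbo.com/listings/search/results/']);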

src/service.js
