For Sale By Owner Scraper

apifyscraper/fsbo-scraper

FSBO Detail Scraper is an innovative tool designed for real estate investors targeting off-market, discounted properties. This scraper specializes in extracting detailed listings of "For Sale by Owner" (FSBO) properties from www.fsbo.com. It's a subscription-based service that provides comprehensive
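
For programmatic use, the Actor can also be started through the Apify API. Below is a minimal sketch using the apify-client JavaScript package; the Actor ID apifyscraper/fsbo-scraper comes from this page, the input fields come from .actor/input_schema.json below, and APIFY_TOKEN is assumed to be set in your environment.

import { ApifyClient } from 'apify-client';

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the Actor and wait for the run to finish.
const run = await client.actor('apifyscraper/fsbo-scraper').call({
    searchQueries: ['Tampa, FL'],
    maxPages: 2,
});

// Read the scraped listings from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.dir(items);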

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

# Added by Apify CLI
.venv

package.json

{
	"name": "fsbo-scraper",
	"version": "0.0.1",
	"type": "module",
	"description": "Apify Actor that scrapes For Sale By Owner (FSBO) listings from fsbo.com.",
	"dependencies": {
		"apify": "^3.1.10",
		"crawlee": "^3.7.1",
		"lodash": "^4.17.21",
		"puppeteer": "*",
		"puppeteer-extra": "^3.3.6",
		"puppeteer-extra-plugin-stealth": "^2.11.2"
	},
	"devDependencies": {
		"@apify/eslint-config": "^0.4.0",
		"eslint": "^8.50.0"
	},
	"scripts": {
		"start": "node src/main.js",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}

.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-puppeteer-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, rebuilds will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm start --silent

.actor/actor.json

{
	"actorSpecification": 1,
	"name": "fsbo-scraper",
	"title": "Crawl For Sale By Owner properties from fsbo.com",
	"description": "Scrapes For Sale By Owner property listings from fsbo.com using Crawlee and Puppeteer.",
	"version": "0.0",
	"meta": {
		"templateId": "js-crawlee-puppeteer-chrome"
	},
	"input": "./input_schema.json",
	"dockerfile": "./Dockerfile"
}

.actor/input_schema.json

{
    "title": "For Sale By Owner Scraper Input",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "searchQueries": {
            "title": "Address (City/Zip)",
            "type": "array",
            "description": "Where would you like to find FSBO leads?",
            "editor": "stringList",
            "placeholderValue": "Tampa, FL",
            "prefill": [
                "Tampa, FL"
            ]
        },
        "maxPages": {
            "title": "Max pages per address search",
            "type": "integer",
            "description": "Maximum number of result pages to crawl for each search query.",
            "minimum": 1,
            "maximum": 5
        },
        "proxyConfiguration": {
            "title": "Proxy configuration",
            "type": "object",
            "description": "Select proxies to be used by your crawler.",
            "prefill": {
                "apifyProxyGroups": [
                    "RESIDENTIAL"
                ]
            },
            "editor": "proxy"
        }
    },
    "required": ["searchQueries"]
}
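
A run input matching this schema might look like the following (values are illustrative):

{
    "searchQueries": ["Tampa, FL", "33605"],
    "maxPages": 3,
    "proxyConfiguration": {
        "useApifyProxy": true,
        "apifyProxyGroups": ["RESIDENTIAL"]
    }
}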

src/main.js

import { Actor } from 'apify';
import { launchPuppeteer, sleep, CheerioCrawler, log } from 'crawlee';
import _ from 'lodash';

await Actor.init();

const results = {};

// Create an instance of the CheerioCrawler class - a crawler
// that automatically loads the URLs and parses their HTML using the cheerio library.
const cheerioCrawler = new CheerioCrawler({
    // The crawler downloads and processes the web pages in parallel, with a concurrency
    // automatically managed based on the available system memory and CPU (see AutoscaledPool class).
    // Here we define some hard limits for the concurrency.
    minConcurrency: 10,
    maxConcurrency: 50,

    // On error, retry each page up to 3 times.
    maxRequestRetries: 3,

    // Increase the timeout for processing of each page.
    requestHandlerTimeoutSecs: 30,

    // Limit to 1000 requests per crawl.
    maxRequestsPerCrawl: 1000,

    // This function will be called for each URL to crawl.
    // It accepts a single parameter, which is an object with options as described at:
    // https://crawlee.dev/api/cheerio-crawler/interface/CheerioCrawlerOptions#requestHandler
    // For demonstration, we use only 2 of them:
    // - request: an instance of the Request class with information such as the URL that is being crawled and the HTTP method
    // - $: the cheerio object containing parsed HTML
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);

        // Store the results to the dataset. In local configuration,
        // the data will be stored as JSON files in ./storage/datasets/default
        await Actor.pushData({
            url: request.url,
            ...results[request.url],
            seller: $('.modal-body div').eq(2).children().eq(1).text().trim(),
            phone: $('.modal-body div').eq(2).children().eq(3).text().trim(),
            price: $('.price').text().trim(),
        });
    },

    // This function is called if the page processing failed more than maxRequestRetries + 1 times.
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed too many times.`);
    },
});

const input = await Actor.getInput() ?? {};
const maxPages = input.maxPages || 1;
console.log(`maxPages ${maxPages}`);

if (!input.searchQueries?.length) {
    await Actor.fail('Input must contain at least one search query');
} else {
    // Launch the web browser.
    const browser = await launchPuppeteer({
        launchOptions: {
            headless: false,
            args: ['--no-sandbox'],
        },
    });
    let res = [];

    for (const searchQuery of input.searchQueries) {
        console.log(searchQuery);

        // Create and navigate a new page.
        console.log('Open target page');
        const page = await browser.newPage();
        await page.goto('https://fsbo.com/listings/search/results/');

        // Fill in and submit the search form, then give the results page time to load.
        await page.type('.search-query', searchQuery);
        await page.evaluate(() => {
            document.querySelector('#RefineSearchDistance').value = '100';
            document.querySelector('form .btn-fsbo-primary').click();
        });
        await sleep(3000);

        let hasNextPage = true;
        let pageNum = 1;

        while (hasNextPage) {
            console.log(`Scraping results page ${pageNum}`);
            const currentPageResults = await page.evaluate(() => {
                const result = [];
                for (const listing of document.querySelectorAll('.listing-item')) {
                    const askingPrice = listing.querySelector('.listing-right h4').textContent;
                    const address1 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[1];
                    const address2 = listing.querySelector('.listing-right').innerText.replace(askingPrice, '').split('\n')[2];

                    const zipcode = address2.slice(-5);
                    const city = address2.replace(zipcode, '').trim();
                    result.push({
                        url: listing.querySelector('a').href,
                        askingPrice,
                        address1,
                        address2,
                        city,
                        zipcode,
                        title: listing.querySelector('h4').textContent,
                    });
                }

                return result;
            });

            hasNextPage = await page.evaluate(() => {
                return !!document.querySelector('.nextPage');
            });
            // Respect the maxPages input limit.
            if (pageNum >= maxPages) hasNextPage = false;
            console.log(`Current page: ${currentPageResults.length} | ${currentPageResults[0]?.url}`);
            res = [...res, ...currentPageResults];
            if (hasNextPage) {
                console.log('Clicking next page');
                await page.click('.nextPage');
                await sleep(5000);
                pageNum++;
            }
        }
    }

    // Index results by URL so the detail crawler can merge them into its output.
    res.forEach((r) => {
        results[r.url] = { ...r };
    });
    console.log(`All: ${res.length}`);
    const allPages = _.uniq(res.map((r) => r.url));
    console.log(allPages);
    console.log(`Unique pages: ${allPages.length}`);
    await cheerioCrawler.run(allPages);

    // Close the browser only after all search queries have been processed.
    await browser.close();

    await Actor.exit();
}
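
Each dataset item merges the fields collected from the search results page with the seller details extracted by the CheerioCrawler. An illustrative record (all values made up) could look like:

{
    "url": "https://fsbo.com/listings/listing/show/id/000000/",
    "askingPrice": "$350,000",
    "address1": "123 Main St",
    "address2": "Tampa, FL 33605",
    "city": "Tampa, FL",
    "zipcode": "33605",
    "title": "3 Bed / 2 Bath Single Family Home",
    "seller": "Jane Doe",
    "phone": "(813) 555-0100",
    "price": "$350,000"
}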

src/routes.js

import { Dataset, createPuppeteerRouter } from 'crawlee';

export const router = createPuppeteerRouter();

router.addDefaultHandler(async ({ enqueueLinks, log }) => {
    log.info(`enqueueing new URLs`);
    await enqueueLinks({
        globs: ['https://www.zillow.com/ybor-city-tampa-fl/?searchQueryState=%7B%22isMapVisible%22%3Atrue%2C%22mapBounds%22%3A%7B%22north%22%3A27.98882307307199%2C%22south%22%3A27.957592306053694%2C%22east%22%3A-82.42739039666748%2C%22west%22%3A-82.46936160333252%7D%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A15%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A126709%2C%22regionType%22%3A8%7D%5D%2C%22pagination%22%3A%7B%7D%7D'],
        label: 'detail',
    });
});

router.addHandler('detail', async ({ request, page, log }) => {
    await page.type('input', 'tampa, fl');
    await page.click('.btn-fsbo-primary');
    await page.waitForNavigation();
    const title = await page.title();
    log.info(`${title}`, { url: request.loadedUrl });
    const num = await page.evaluate(() => {
        return document.querySelector('.detail-box-phone a').textContent;
    });
    // Collect the listing headings inside the browser context; DOM APIs
    // such as document.querySelectorAll are not available in Node.js code.
    const listings = await page.evaluate(() => {
        return [...document.querySelectorAll('.listing-item div h4')].map((el) => el.textContent);
    });
    for (const listing of listings) {
        await Dataset.pushData({
            url: listing,
            // title,
            // num,
        });
    }
});
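
Note that src/main.js never imports this router; it drives Puppeteer directly, and this file appears to be left over from the project template. If you did want to use it, a router created with createPuppeteerRouter is normally passed to a PuppeteerCrawler as its request handler, roughly like this (a sketch, not part of the Actor):

import { PuppeteerCrawler } from 'crawlee';
import { router } from './routes.js';

const crawler = new PuppeteerCrawler({
    requestHandler: router,
});

await crawler.run(['https://fsbo.com/listings/search/results/']);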

src/service.js

Developer
Maintained by Community
Actor metrics
  • 4 monthly users
  • 2 stars
  • 91.5% runs succeeded
  • Created in Jan 2024
  • Modified 8 months ago