1import { Actor } from 'apify';
2import { launchPuppeteer, sleep, CheerioCrawler, log } from 'crawlee';
3import _ from 'lodash';
// Bring up the Apify Actor runtime before any input or dataset access below.
await Actor.init();

// Lookup table of search-page listing summaries keyed by listing URL; it is
// filled while paging through search results and read back by the detail
// crawler's request handler to enrich each pushed record.
const results = {};
6
7
/**
 * Crawler for individual listing detail pages.
 *
 * Each handled page is merged with the summary stored in `results` under the
 * same URL (if any) and pushed to the default dataset together with the
 * seller, phone and price read from the detail page.
 */
const cheerioCrawler = new CheerioCrawler({
    // Bounds for the number of requests processed in parallel.
    minConcurrency: 10,
    maxConcurrency: 50,

    // A failing request is retried up to this many times before
    // failedRequestHandler is invoked.
    maxRequestRetries: 3,

    // Upper bound, in seconds, for a single requestHandler invocation.
    requestHandlerTimeoutSecs: 30,

    // Hard cap on the number of requests processed by this crawler.
    maxRequestsPerCrawl: 1000,

    /**
     * Extracts contact and price information from one listing detail page.
     *
     * @param {object} context - Crawling context supplied by CheerioCrawler.
     * @param {object} context.request - The request being processed.
     * @param {Function} context.$ - Cheerio handle for the loaded page.
     */
    async requestHandler({ request, $ }) {
        log.debug(`Processing ${request.url}...`);

        // Seller and phone are read positionally from the children of the
        // third `div` under `.modal-body`; look that node up only once.
        const contactFields = $('.modal-body div').eq(2).children();

        await Actor.pushData({
            url: request.url,
            // Spreading `undefined` is a no-op, so URLs without a stored
            // summary still produce a record.
            ...results[request.url],
            seller: contactFields.eq(1).text().trim(),
            phone: contactFields.eq(3).text().trim(),
            price: $('.price').text().trim(),
        });
    },

    /**
     * Reports a request that kept failing after all retries were used up.
     * Logged at error level so permanent failures are visible without
     * enabling debug logging.
     *
     * @param {object} context - Crawling context supplied by CheerioCrawler.
     * @param {object} context.request - The request that failed.
     * @param {Error} [error] - The last error raised for the request.
     */
    failedRequestHandler({ request }, error) {
        log.error(`Request ${request.url} failed too many times.`, {
            error: error?.message,
        });
    },
});
49
/** Search form entry point on the target site. */
const SEARCH_URL = 'https://fsbo.com/listings/search/results/';

/** Fixed wait after submitting the search form, in milliseconds. */
const SEARCH_SETTLE_MS = 3000;

/** Fixed wait after clicking the next-page control, in milliseconds. */
const PAGE_SETTLE_MS = 5000;

/**
 * Reads the summary fields of every listing card on the results page that
 * `page` is currently showing. The callback runs inside the browser.
 *
 * @param {object} page - Puppeteer page displaying search results.
 * @returns {Promise<Array<object>>} One record per usable `.listing-item`.
 */
function scrapeListingCards(page) {
    return page.evaluate(() => {
        const cards = [];
        for (const listing of document.querySelectorAll('.listing-item')) {
            const details = listing.querySelector('.listing-right');
            const link = listing.querySelector('a');
            // A card without a link or details pane cannot be crawled or
            // described; skip it instead of failing the whole page.
            if (!details || !link) continue;

            const askingPrice = listing.querySelector('.listing-right h4')?.textContent ?? '';
            const lines = details.innerText.replace(askingPrice, '').split('\n');
            const address1 = lines[1];
            // Default to '' because the value is sliced and replaced below.
            const address2 = lines[2] ?? '';
            const zipcode = address2.slice(-5);
            const city = address2.replace(zipcode, '').trim();

            cards.push({
                url: link.href,
                askingPrice,
                address1,
                address2,
                city,
                zipcode,
                title: listing.querySelector('h4')?.textContent ?? '',
            });
        }
        return cards;
    });
}

/**
 * Runs one search on the site and collects listing summaries from at most
 * `maxPages` result pages. The tab opened for the search is always closed.
 *
 * @param {object} browser - Puppeteer browser used to open the tab.
 * @param {string} searchQuery - Text typed into the search box.
 * @param {number} maxPages - Maximum number of result pages to read (>= 1).
 * @returns {Promise<Array<object>>} Listing summaries found for the query.
 */
async function scrapeSearchQuery(browser, searchQuery, maxPages) {
    console.log(searchQuery);
    console.log('Open target page');
    const page = await browser.newPage();
    try {
        await page.goto(SEARCH_URL);
        await page.type('.search-query', searchQuery);
        await page.evaluate(() => {
            document.querySelector('#RefineSearchDistance').value = '100';
            document.querySelector('form .btn-fsbo-primary').click();
        });
        await sleep(SEARCH_SETTLE_MS);

        const found = [];
        for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
            console.log('next page');
            const cards = await scrapeListingCards(page);
            // A page may legitimately contain no listings, so guard [0].
            console.log(`Current page: ${cards.length} | ${cards[0]?.url ?? 'no listings'}`);
            found.push(...cards);

            const hasNextPage = await page.evaluate(() => !!document.querySelector('.nextPage'));
            if (!hasNextPage || pageNumber === maxPages) break;

            console.log('clicking nextpage');
            await page.click('.nextPage');
            await sleep(PAGE_SETTLE_MS);
        }
        return found;
    } finally {
        await page.close();
    }
}

// A missing input record yields null; fall back to an empty object so the
// validation below reports the problem instead of a TypeError.
const input = (await Actor.getInput()) ?? {};
const maxPages = Math.max(1, Number(input.maxPages) || 1);
console.log(`maxPages ${maxPages}`);

if (!input.searchQueries?.length) {
    // Actor.fail ends the run with a failure status and this message.
    await Actor.fail('Input must contain at least one search query');
} else {
    // NOTE(review): headless is disabled, which needs a display on the host;
    // confirm this is intended outside local debugging.
    const browser = await launchPuppeteer({
        launchOptions: {
            headless: false,
            args: ['--no-sandbox'],
        },
    });

    const res = [];
    try {
        for (const searchQuery of input.searchQueries) {
            res.push(...await scrapeSearchQuery(browser, searchQuery, maxPages));
        }
    } finally {
        // Close the browser once, after every query has been processed, and
        // also when a query throws.
        await browser.close();
    }

    // Publish the summaries before crawling so the detail crawler's request
    // handler can merge them by URL.
    for (const listing of res) {
        results[listing.url] = { ...listing };
    }

    console.log(`All: ${res.length}`);
    const allPages = _.uniq(res.map((listing) => listing.url));
    console.log(allPages);
    console.log(`Unique Pages: ${allPages.length}`);

    // Crawl all detail pages in a single run instead of once per query.
    await cheerioCrawler.run(allPages);

    await Actor.exit();
}