Spareroom avatar

Spareroom

Deprecated
View all Actors
This Actor is deprecated

This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?

See alternative Actors
Spareroom

Spareroom

zyberg/spareroom

Scrape Spare Room listings by postcodes

Dockerfile

1# This is a template for a Dockerfile used to run acts in the Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# First, copy just package.json and package-lock.json, since they should be
7# the only files that affect "npm install" in the next step, to speed up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY --chown=myuser:myuser . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

1{
2    "name": "apify-project",
3    "version": "0.0.1",
4    "description": "",
5    "author": "It's not you it's me",
6    "license": "ISC",
7    "dependencies": {
8        "apify": "0.21.10",
9        "moment": "latest",
10        "lodash": "latest"
11    },
12    "scripts": {
13        "start": "node main.js"
14    }
15}

main.js

1const Apify = require('apify');
2//var moment = require('moment');
3//var _ = require('lodash');
4
/**
 * Blank record for one scraped listing.
 *
 * The key order is the column order of the exported dataset. Every field
 * starts out empty (`''`, `[]` or `null`) and is filled in per listing from a
 * deep copy of this object, so the template itself is never mutated.
 */
const templateResult = {
  // Core listing data.
  Title: '',
  'All Images': [],
  Description: '',
  Postcode: null,
  Rent: null,

  // Amenities.
  Furnishings: null,
  Parking: null,
  Garage: null,
  'Garden/terrace': null,
  'Balcony/Patio': null,
  'Disabled access': null,
  'Living room': null,
  Broadband: null,

  // Household details (the original author marked these two as uncertain).
  '# housemates': null,
  'Total # rooms': null,

  // Preferences for the new tenant.
  'Couples OK?': null,
  'Smoking OK?': null,
  'Pets OK?': null,
  Occupation: null,
  'Housing Benefit': null,
  'References?': null,
  'Max age': null,
  Gender: null,
};
31
32
/** Root of the site; listing links found on result pages are resolved against it. */
const SITE_ROOT_URL = 'https://www.spareroom.co.uk/';

/** Base of the search result pages; pagination links are resolved against it. */
const SEARCH_BASE_URL = 'https://www.spareroom.co.uk/flatshare/';

/**
 * Pairs of [result key, <dt> label as it appears in the listing HTML].
 * Only "Broadband" differs between the two.
 */
const FEATURE_LABELS = [
  ['Furnishings', 'Furnishings'],
  ['Parking', 'Parking'],
  ['Garage', 'Garage'],
  ['Garden/terrace', 'Garden/terrace'],
  ['Balcony/Patio', 'Balcony/Patio'],
  ['Disabled access', 'Disabled access'],
  ['Living room', 'Living room'],
  ['Broadband', 'Broadband included'],
  ['# housemates', '# housemates'],
  ['Total # rooms', 'Total # rooms'],
  ['Couples OK?', 'Couples OK?'],
  ['Smoking OK?', 'Smoking OK?'],
  ['Pets OK?', 'Pets OK?'],
  ['Occupation', 'Occupation'],
  ['Housing Benefit', 'Housing Benefit'],
  ['References?', 'References?'],
  ['Max age', 'Max age'],
  ['Gender', 'Gender'],
];

/**
 * Escapes every regular-expression metacharacter in `text`.
 *
 * @param {string} text - Literal text to embed in a RegExp source.
 * @returns {string} The escaped text.
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Reads the value of one `<dt>label</dt> <dd class="feature-list__value">…`
 * pair out of the raw listing HTML.
 *
 * The value is taken from the same line as the opening `<dd>`, up to the
 * closing `</dd>` or the end of that line. All markup is stripped, so values
 * wrapped in `<span>` come out as plain text for every field.
 *
 * @param {string} html - Raw HTML of a listing page.
 * @param {string} label - Text of the `<dt>` element to look for.
 * @returns {?string} "Yes" / "No" when the text contains that word, the plain
 *   text otherwise, or `null` when the feature is absent or empty.
 */
function extractFeature(html, label) {
  const pattern = new RegExp(
    `${escapeRegExp(label)}</dt>\\s+<dd class="feature-list__value">(.+?)(?:</dd>|$)`,
    'm'
  );
  const found = pattern.exec(html);
  if (found === null) return null;

  const text = found[1]
    .replace(/<[^>]*>/g, '') // complete tags
    .replace(/<[^>]*$/, '') // a tag cut off by the end of the line
    .trim();
  if (text === '') return null;

  // Substring (not whole-word) checks are kept on purpose so that the
  // normalised output stays compatible with earlier runs of this actor.
  if (text.includes('Yes')) return 'Yes';
  if (text.includes('No')) return 'No';
  return text;
}

/**
 * Turns the price and size captions of one room row into a rent record.
 * Parts that cannot be found are reported as `null` instead of throwing.
 *
 * @param {string} priceText - Caption such as "£1,200 pcm".
 * @param {string} sizeText - Caption such as "double".
 * @returns {{Price: ?string, Size: ?string, Per: ?string}} Rent record; the
 *   price has its thousands separators removed.
 */
function parseRoomRent(priceText, sizeText) {
  const amount = priceText.match(/\d[\d,]*/);
  const size = sizeText.match(/\w+/);
  const period = priceText.match(/(?<=\d )[\w]+/);

  return {
    Price: amount ? amount[0].replace(/,/g, '') : null,
    Size: size ? size[0] : null,
    Per: period ? period[0] : null,
  };
}

/**
 * Resolves a link taken from the page against `base`.
 *
 * @param {string|undefined} href - Value of an `href` attribute.
 * @param {string} base - Absolute URL to resolve against.
 * @returns {?string} Absolute URL, or `null` when the attribute is missing.
 */
function resolveHref(href, base) {
  if (!href) return null;
  return new URL(href, base).href;
}

/**
 * Builds the dataset record for a single listing page.
 *
 * @param {Function} $ - Cheerio handle for the listing page.
 * @param {string} html - Raw HTML of the same page.
 * @param {string} postcode - Postcode whose search led to this listing.
 * @returns {Object} A filled-in copy of `templateResult`.
 */
function buildListing($, html, postcode) {
  // Deep copy, so the shared template (and its images array) stays untouched.
  const listing = JSON.parse(JSON.stringify(templateResult));

  listing['Title'] = $('#listing_heading h1').first().text().trim();
  listing['All Images'] = $('a.photoswipe_me.img')
    .map((index, el) => $(el).attr('href'))
    .get();
  listing['Description'] = $('.detaildesc').first().text().trim();
  listing['Postcode'] = postcode;

  listing['Rent'] = [];
  $('.room-list__room').each((index, el) => {
    const room = $(el);
    // Rows containing an <s> (struck-through) element are skipped.
    if (room.find('s').length !== 0) return;

    const priceText = room.find('.room-list__price').first().text().trim();
    const sizeText = room.find('.room-list__room small').first().text().trim();
    listing['Rent'].push(parseRoomRent(priceText, sizeText));
  });

  for (const [key, label] of FEATURE_LABELS) {
    listing[key] = extractFeature(html, label);
  }

  // Fall back to the household-preferences box when the feature list did not
  // yield an occupation.
  if (!listing['Occupation']) {
    const preferences = $('.feature.feature--household-preferences').first().text();
    const found = preferences.match(/(?<=Occupation\s+)(\w|\w )+/);
    listing['Occupation'] = found ? found[0].trim() : null;
  }

  return listing;
}

/**
 * Enqueues every listing linked from a search result page, plus the next
 * result page when there is one. Each `addRequest` call is awaited, so the
 * page is only reported as handled once all its links are in the queue.
 *
 * @param {Function} $ - Cheerio handle for the result page.
 * @param {Object} requestQueue - Queue that receives the new requests.
 * @param {string} postcode - Postcode carried over to the new requests.
 * @returns {Promise<void>}
 */
async function enqueueFromResultPage($, requestQueue, postcode) {
  const listingHrefs = $('.listing-result:not(:first-child) header.desktop a')
    .map((index, el) => $(el).attr('href'))
    .get();

  for (const href of listingHrefs) {
    const url = resolveHref(href, SITE_ROOT_URL);
    if (url === null) continue;
    await requestQueue.addRequest({
      url,
      userData: { label: 'ONE', postcode },
    });
  }

  const nextPageUrl = resolveHref(
    $('#paginationNextPageLink').first().attr('href'),
    SEARCH_BASE_URL
  );
  if (nextPageUrl !== null) {
    await requestQueue.addRequest({
      url: nextPageUrl,
      userData: { label: 'LIST', postcode },
    });
  }
}

/**
 * Actor entry point.
 *
 * 1. A Puppeteer crawler opens the home page and submits the search form once
 *    per postcode from the input, recording the URL of each result page.
 * 2. A Cheerio crawler walks those result pages ("LIST"), follows pagination,
 *    and pushes one `{ error, data }` record per listing page ("ONE") to the
 *    "spareroom" dataset.
 *
 * NOTE(review): both request queues are opened by name; confirm that reusing
 * them between runs (and so skipping already-handled URLs) is intended.
 */
Apify.main(async () => {
  const requestQueue = await Apify.openRequestQueue('spareroom');
  const requestQueueP = await Apify.openRequestQueue('spareroomP');
  const dataset = await Apify.openDataset('spareroom');

  const input = await Apify.getInput();
  if (!input || !Array.isArray(input.postcodes)) {
    throw new Error('Input must be an object with a "postcodes" array.');
  }

  await requestQueueP.addRequest({
    url: 'https://www.spareroom.co.uk/',
    userData: {
      postcodes: input.postcodes,
    },
  });

  const searchPages = [];
  const pptrCrawler = new Apify.PuppeteerCrawler({
    requestQueue: requestQueueP,
    handlePageFunction: async ({ request, page }) => {
      const selectorSearch = '#search_by_location_field';
      const selectorSearchSubmit = '#search_by_location_submit_button';
      const selectorLoaded = '.listing-result';

      for (const postcode of request.userData.postcodes) {
        // page.$() returns a Promise, so comparing its result to null never
        // detected a missing field; waiting unconditionally is cheap when the
        // field is already there.
        await page.waitForSelector(selectorSearch);

        await page.focus(selectorSearch);
        await page.keyboard.type(String(postcode));
        await page.click(selectorSearchSubmit);

        await page.waitForSelector(selectorLoaded);
        searchPages.push({ url: page.url(), postcode });

        await page.goBack();
      }
    },
  });

  await pptrCrawler.run();

  for (const searchPage of searchPages) {
    await requestQueue.addRequest({
      url: searchPage.url,
      userData: {
        postcode: searchPage.postcode,
        label: 'LIST',
      },
    });
  }

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    handlePageFunction: async ({ request, html, $ }) => {
      const { label, postcode } = request.userData;

      if (label === 'LIST') {
        await enqueueFromResultPage($, requestQueue, postcode);
      } else if (label === 'ONE') {
        // The record is built only for listing pages; result pages have
        // nothing to extract.
        const output = {
          error: null,
          data: buildListing($, html, postcode),
        };

        console.log(output);

        await dataset.pushData(output);
      }
    },
  });

  await crawler.run();
});
Developer
Maintained by Community
Categories