Spareroom avatar
Spareroom

Deprecated

Pricing

Pay per usage

Go to Store
Spareroom

Spareroom

Deprecated

Developed by

Nikolajus Elmutis

Maintained by Community

Scrape SpareRoom listings by postcode

0.0 (0)

Pricing

Pay per usage

1

Monthly users

2

Last modified

3 years ago

Dockerfile

# This is a template for a Dockerfile used to run acts in the Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# First, copy just package.json and package-lock.json, since they should be
# the only files that affect "npm install" in the next step. Keeping this in
# its own layer speeds up the build when only the source code changed.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree plus the Node.js/NPM versions for debugging.
# "npm list" is allowed to fail ("|| true") so that it cannot break the build.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy the source code to the container.
# Do this in the last step, to have a fast build if only the source code changed.
COPY --chown=myuser:myuser . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
    "name": "apify-project",
    "version": "0.0.1",
    "description": "",
    "author": "It's not you it's me",
    "license": "ISC",
    "dependencies": {
        "apify": "0.21.10",
        "moment": "latest",
        "lodash": "latest"
    },
    "scripts": {
        "start": "node main.js"
    }
}

main.js

const Apify = require('apify');
//var moment = require('moment');
//var _ = require('lodash');

/**
 * Blank record describing one scraped listing.
 *
 * The keys are the exact field names written to the dataset, so they must not
 * be renamed. The object (and its nested array) is frozen because it is a
 * shared template: consumers are expected to deep-clone it before filling it
 * in, never to write to it directly.
 */
const templateResult = Object.freeze({
  'Title': '',
  'All Images': Object.freeze([]),
  'Description': '',
  'Postcode': null,
  'Rent': null,

  // Property / household features; null means "not found on the page".
  'Furnishings': null,
  'Parking': null,
  'Garage': null,
  'Garden/terrace': null,
  'Balcony/Patio': null,
  'Disabled access': null,
  'Living room': null,
  'Broadband': null,
  '# housemates': null, // NOTE(review): original author marked this field "???"
  'Total # rooms': null, // NOTE(review): original author marked this field "???"
  'Couples OK?': null,
  'Smoking OK?': null,
  'Pets OK?': null,
  'Occupation': null,
  'Housing Benefit': null,
  'References?': null,
  'Max age': null,
  'Gender': null,
});


/** Site root; listing hrefs found on a result page are resolved against it. */
const SITE_ROOT = 'https://www.spareroom.co.uk/';

/** Base the "next page" pagination href is resolved against. */
const FLATSHARE_BASE = 'https://www.spareroom.co.uk/flatshare/';

/**
 * Maps each feature key of the result record to the label text that precedes
 * its value in the page markup (`<label></dt> <dd class="feature-list__value">`).
 * Only "Broadband" uses a label that differs from its key.
 */
const FEATURE_LABELS = {
  'Furnishings': 'Furnishings',
  'Parking': 'Parking',
  'Garage': 'Garage',
  'Garden/terrace': 'Garden/terrace',
  'Balcony/Patio': 'Balcony/Patio',
  'Disabled access': 'Disabled access',
  'Living room': 'Living room',
  'Broadband': 'Broadband included',
  '# housemates': '# housemates',
  'Total # rooms': 'Total # rooms',
  'Couples OK?': 'Couples OK?',
  'Smoking OK?': 'Smoking OK?',
  'Pets OK?': 'Pets OK?',
  'Occupation': 'Occupation',
  'Housing Benefit': 'Housing Benefit',
  'References?': 'References?',
  'Max age': 'Max age',
  'Gender': 'Gender',
};

/** Escapes every regular-expression metacharacter in `text`. */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * One pre-compiled pattern per feature, built once instead of on every page.
 * Each pattern captures the rest of the line after the opening `<dd ...>` tag,
 * up to and including the last `<` on that line.
 */
const FEATURE_PATTERNS = Object.keys(FEATURE_LABELS).map((key) => ({
  key,
  pattern: new RegExp(
    `(?<=${escapeRegExp(FEATURE_LABELS[key])}<\\/dt>\\s+<dd class="feature-list__value">).+<`
  ),
}));

/**
 * Returns the first match of `pattern` in `text`, or null when there is none
 * (or when `text` is not a string).
 */
function firstMatch(text, pattern) {
  if (typeof text !== 'string') return null;
  const found = text.match(pattern);
  return found ? found[0] : null;
}

/**
 * Normalises a raw feature fragment captured by FEATURE_PATTERNS.
 *
 * Anything containing "Yes" or "No" collapses to exactly that word (the check
 * runs on the raw fragment, markup included, and is a plain substring test, so
 * e.g. "No preference" also becomes "No"). Every other value has its tags and
 * the stray trailing `<` removed and is trimmed.
 *
 * @param {?string} raw - Fragment as captured from the HTML, or null.
 * @returns {?string} Cleaned text, or null when nothing usable remains.
 */
function cleanFeatureValue(raw) {
  if (!raw) return null;
  if (raw.includes('Yes')) return 'Yes';
  if (raw.includes('No')) return 'No';

  const text = raw.replace(/<[^>]*>/g, '').replace(/</g, '').trim();
  return text === '' ? null : text;
}

/**
 * Parses one room row into its rent record.
 *
 * @param {string} priceText - Text of the price element; presumably shaped like "£1,200 pcm" — confirm against the live site.
 * @param {string} sizeText - Text of the room-size element.
 * @returns {{Price: ?string, Size: ?string, Per: ?string}} Each part is null when it cannot be found.
 */
function parseRoomRent(priceText, sizeText) {
  // Accept thousands separators so that "1,200" is not truncated to "1".
  const amount = firstMatch(priceText, /\d[\d,]*/);
  return {
    'Price': amount === null ? null : amount.replace(/,/g, ''),
    'Size': firstMatch(sizeText, /\w+/),
    'Per': firstMatch(priceText, /(?<=\d )[\w]+/),
  };
}

/**
 * Builds the result record for a single listing page.
 *
 * @param {Function} $ - Cheerio handle for the loaded page.
 * @param {string} html - Raw HTML of the page (features are read with regular expressions).
 * @param {*} postcode - Postcode whose search led to this listing.
 * @returns {Object} A filled-in deep copy of `templateResult`.
 */
function buildListingResult($, html, postcode) {
  // JSON round-trip gives a mutable deep copy of the shared template; the
  // template only holds JSON-safe values.
  const result = JSON.parse(JSON.stringify(templateResult));

  result['Title'] = $('#listing_heading h1').first().text().trim();
  result['All Images'] = $('a.photoswipe_me.img')
    .map((index, el) => $(el).attr('href'))
    .get();
  result['Description'] = $('.detaildesc').first().text().trim();
  result['Postcode'] = postcode;

  result['Rent'] = [];
  $('.room-list__room').each((index, el) => {
    // Rows containing an <s> element are skipped (presumably struck-through,
    // i.e. no longer available — confirm against the live site).
    if ($(el).find('s').length !== 0) return;

    const priceText = $(el).find('.room-list__price').first().text().trim();
    const sizeText = $(el).find('.room-list__room small').first().text().trim();
    result['Rent'].push(parseRoomRent(priceText, sizeText));
  });

  for (const { key, pattern } of FEATURE_PATTERNS) {
    result[key] = cleanFeatureValue(firstMatch(html, pattern));
  }

  // Fall back to the household-preferences text when the feature list has no
  // "Occupation" entry. Stored as a string, like every other feature.
  if (!result['Occupation']) {
    const preferences = $('.feature.feature--household-preferences').first().text();
    const occupation = firstMatch(preferences, /(?<=Occupation\s+)(\w|\w )+/);
    result['Occupation'] = occupation === null ? null : occupation.trim();
  }

  return result;
}

/**
 * Actor entry point.
 *
 * 1. A Puppeteer crawler opens the home page and submits the search form once
 *    per input postcode, recording the URL of each result list.
 * 2. A Cheerio crawler walks those result lists (following pagination),
 *    enqueues every listing, and pushes one record per listing to the dataset.
 *
 * Expected input: `{ "postcodes": [ ... ] }`.
 */
Apify.main(async () => {
  // NOTE(review): these are *named* storages, which presumably persist between
  // runs; requests already handled by an earlier run may then be skipped.
  // Confirm against the Apify storage documentation before relying on re-runs.
  const requestQueue = await Apify.openRequestQueue('spareroom');
  const requestQueueP = await Apify.openRequestQueue('spareroomP');
  const dataset = await Apify.openDataset('spareroom');

  const input = await Apify.getInput();
  if (!input || !Array.isArray(input.postcodes)) {
    throw new Error('Actor input must be an object with a "postcodes" array.');
  }

  await requestQueueP.addRequest({
    url: 'https://www.spareroom.co.uk/',
    userData: {
      postcodes: input.postcodes,
    },
  });

  // Keyed by postcode so that a retried request overwrites its earlier
  // entries instead of appending duplicates.
  const searchUrlByPostcode = new Map();

  const pptrCrawler = new Apify.PuppeteerCrawler({
    requestQueue: requestQueueP,
    handlePageFunction: async ({ request, page }) => {
      const selectorSearch = '#search_by_location_field';
      const selectorSearchSubmit = '#search_by_location_submit_button';
      const selectorLoaded = '.listing-result';

      for (const postcode of request.userData.postcodes) {
        // page.$() returns a Promise, so it can never be compared to null;
        // simply wait for the field every time.
        await page.waitForSelector(selectorSearch);

        // Navigating back may restore the previously typed value.
        await page.$eval(selectorSearch, (field) => { field.value = ''; });
        await page.focus(selectorSearch);
        await page.keyboard.type(String(postcode));
        await page.click(selectorSearchSubmit);

        await page.waitForSelector(selectorLoaded);
        searchUrlByPostcode.set(postcode, page.url());

        await page.goBack();
      }
    },
  });

  await pptrCrawler.run();

  for (const [postcode, url] of searchUrlByPostcode) {
    await requestQueue.addRequest({
      url,
      userData: {
        postcode,
        label: 'LIST',
      },
    });
  }

  /** Enqueues every listing on a result page, plus the next result page. */
  const scrapeList = async ($, postcode) => {
    // Collect the hrefs first: cheerio's .each() does not wait for async
    // callbacks, which would leave the addRequest() promises floating.
    const listingHrefs = $('.listing-result:not(:first-child) header.desktop a')
      .map((index, el) => $(el).attr('href'))
      .get();

    for (const href of listingHrefs) {
      await requestQueue.addRequest({
        url: new URL(href, SITE_ROOT).href,
        userData: {
          label: 'ONE',
          postcode,
        },
      });
    }

    const nextHref = $('#paginationNextPageLink').first().attr('href');
    if (nextHref) {
      await requestQueue.addRequest({
        url: new URL(nextHref, FLATSHARE_BASE).href,
        userData: {
          label: 'LIST',
          postcode,
        },
      });
    }
  };

  /** Scrapes a single listing page and stores it in the dataset. */
  const scrapeOne = async ($, html, postcode) => {
    const output = {
      'error': null,
      'data': buildListingResult($, html, postcode),
    };

    console.log(output);

    await dataset.pushData(output);
  };

  const crawler = new Apify.CheerioCrawler({
    requestQueue,
    handlePageFunction: async ({ request, html, $ }) => {
      const { label, postcode } = request.userData;

      if (label === 'LIST') await scrapeList($, postcode);
      else if (label === 'ONE') await scrapeOne($, html, postcode);
    },
  });

  await crawler.run();
});

Pricing

Pricing model

Pay per usage

This Actor is paid per platform usage. The Actor is free to use, and you only pay for the Apify platform usage.