Spareroom avatar
Spareroom

Deprecated

Pricing

Pay per usage

Go to Store
Spareroom

Spareroom

Deprecated

Developed by

Nikolajus Elmutis

Nikolajus Elmutis

Maintained by Community

Scrapes SpareRoom listings by postcode

0.0 (0)

Pricing

Pay per usage

1

Total users

40

Monthly users

2

Last modified

3 years ago

Dockerfile

# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10
# Copy just package.json and package-lock.json first, since they should be
# the only files that affect "npm install" in the next step; this speeds up the build
COPY package*.json ./
# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree, the Node.js version and the NPM version for debugging.
# "npm list" is wrapped in "|| true" so that a non-zero exit from it does not
# abort the rest of the && chain.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Copy source code to container
# Do this in the last step, to have a fast build if only the source code changed
# NOTE(review): assumes the base image defines a "myuser" user and group - confirm
COPY --chown=myuser:myuser . ./
# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]

package.json

{
"name": "apify-project",
"version": "0.0.1",
"description": "",
"author": "It's not you it's me",
"license": "ISC",
"dependencies": {
"apify": "0.21.10",
"moment": "latest",
"lodash": "latest"
},
"scripts": {
"start": "node main.js"
}
}

main.js

1const Apify = require('apify');
2//var moment = require('moment');
3//var _ = require('lodash');
4
/**
 * Blank record for a single listing.
 *
 * Every scraped listing starts as a deep copy of this object, so the output
 * always carries the same set of keys. `Title` and `Description` default to
 * an empty string, `All Images` to an empty array, and every other field to
 * `null` until a value is found on the page.
 */
const templateResult = {
    "Title": '',
    "All Images": [],
    "Description": '',
    "Postcode": null,
    "Rent": null,

    "Furnishings": null,
    "Parking": null,
    "Garage": null,
    "Garden/terrace": null,
    "Balcony/Patio": null,
    "Disabled access": null,
    "Living room": null,
    "Broadband": null,
    // NOTE(review): the original author flagged the next two fields with "???".
    "# housemates": null,
    "Total # rooms": null,
    "Couples OK?": null,
    "Smoking OK?": null,
    "Pets OK?": null,
    "Occupation": null,
    "Housing Benefit": null,
    "References?": null,
    "Max age": null,
    "Gender": null,
};
31
32
/** Site root against which listing links found on search-result pages are resolved. */
const SITE_ROOT_URL = 'https://www.spareroom.co.uk/';

/** Base against which the "next page" link of a search-result page is resolved. */
const LIST_BASE_URL = 'https://www.spareroom.co.uk/flatshare/';

/**
 * Pairs of [result key, <dt> label as it appears in the listing HTML].
 * The two are identical except for "Broadband", whose label is "Broadband included".
 */
const FEATURE_LABELS = [
    ['Furnishings', 'Furnishings'],
    ['Parking', 'Parking'],
    ['Garage', 'Garage'],
    ['Garden/terrace', 'Garden/terrace'],
    ['Balcony/Patio', 'Balcony/Patio'],
    ['Disabled access', 'Disabled access'],
    ['Living room', 'Living room'],
    ['Broadband', 'Broadband included'],
    ['# housemates', '# housemates'],
    ['Total # rooms', 'Total # rooms'],
    ['Couples OK?', 'Couples OK?'],
    ['Smoking OK?', 'Smoking OK?'],
    ['Pets OK?', 'Pets OK?'],
    ['Occupation', 'Occupation'],
    ['Housing Benefit', 'Housing Benefit'],
    ['References?', 'References?'],
    ['Max age', 'Max age'],
    ['Gender', 'Gender'],
];

/** Escapes regular-expression metacharacters so `text` is matched literally. */
function escapeRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * One pre-compiled pattern per feature. Each pattern matches the rest of the
 * line that follows `<LABEL></dt>` + whitespace + `<dd class="feature-list__value">`,
 * up to and including the last `<` on that line.
 */
const FEATURE_PATTERNS = FEATURE_LABELS.map(([key, label]) => ({
    key,
    pattern: new RegExp(`(?<=${escapeRegExp(label)}<\\/dt>\\s+<dd class="feature-list__value">).+<`),
}));

/** Returns the first match of `pattern` in `text`, or `null` when there is none. */
function firstMatch(text, pattern) {
    const found = text.match(pattern);
    return found ? found[0] : null;
}

/** Resolves a possibly relative `href` against `base` and returns the absolute URL string. */
function resolveUrl(href, base) {
    return new URL(href, base).href;
}

/**
 * Normalises a raw feature value.
 *
 * Anything containing "Yes" becomes "Yes", anything containing "No" becomes
 * "No"; otherwise, if the value still contains a span tag, the text after the
 * first ">" is kept. Everything else is returned unchanged.
 *
 * NOTE(review): the "No" test is a plain substring match, so a value such as
 * "No preference" also collapses to "No" - confirm this is intended.
 *
 * @param {?string} value Raw value, or null when the feature was not found.
 * @returns {?string} The normalised value (null stays null).
 */
function normalizeFeature(value) {
    if (!value) return value;
    if (/Yes/.test(value)) return 'Yes';
    if (/No/.test(value)) return 'No';
    if (value.includes('span')) {
        const inner = value.match(/(?<=>).+</);
        if (inner) return inner[0].replace('<', '');
    }
    return value;
}

/**
 * Collects price information for every room entry that has no `<s>` element
 * inside it (presumably struck-through entries are rooms already taken - confirm).
 *
 * Parts that cannot be parsed are reported as `null` instead of throwing, and
 * thousands separators are dropped so "1,200" yields "1200" rather than "1".
 *
 * @param {Function} $ Cheerio handle for the listing page.
 * @returns {Array<{Price: ?string, Size: ?string, Per: ?string}>}
 */
function parseRooms($) {
    const rooms = [];

    $('.room-list__room').each((index, el) => {
        const room = $(el);
        if (room.find('s').length !== 0) return;

        const priceText = room.find('.room-list__price').first().text().trim();
        const sizeText = room.find('.room-list__room small').first().text().trim();
        const amount = firstMatch(priceText, /\d[\d,]*/);

        rooms.push({
            'Price': amount === null ? null : amount.replace(/,/g, ''),
            'Size': firstMatch(sizeText, /\w+/),
            'Per': firstMatch(priceText, /(?<=\d )[\w]+/),
        });
    });

    return rooms;
}

/**
 * Builds one listing record from a listing detail page.
 *
 * Starts from a deep copy of `templateResult` (declared earlier in this file)
 * and fills it from the Cheerio document and the raw HTML.
 *
 * @param {Object} context
 * @param {Object} context.request Crawler request; `userData.postcode` is copied into the record.
 * @param {string} context.html Raw page HTML, searched with the feature patterns.
 * @param {Function} context.$ Cheerio handle for the page.
 * @returns {Object} The filled-in listing record.
 */
function buildListing({ request, html, $ }) {
    const listing = JSON.parse(JSON.stringify(templateResult));

    listing['Title'] = $('#listing_heading h1').first().text().trim();

    $('a.photoswipe_me.img').each((index, el) => {
        listing['All Images'].push($(el).attr('href'));
    });

    listing['Description'] = $('.detaildesc').first().text().trim();
    listing['Postcode'] = request.userData.postcode;
    listing['Rent'] = parseRooms($);

    for (const { key, pattern } of FEATURE_PATTERNS) {
        const raw = firstMatch(html, pattern);
        // Drop the first "<" (for plain values that is the trailing one captured
        // by the pattern) before normalising. Applied to every feature alike.
        listing[key] = normalizeFeature(raw === null ? null : raw.replace('<', ''));
    }

    // Strip one leftover "span", "/", ">" and "<" from the living-room value.
    listing['Living room'] = listing['Living room']
        ? listing['Living room'].replace('span', '').replace('/', '').replace('>', '').replace('<', '')
        : null;

    // Fall back to the household-preferences section when the feature list has
    // no occupation. Store the matched text itself, like every other field.
    if (!listing['Occupation']) {
        const preferences = $('.feature.feature--household-preferences').first().text();
        const found = preferences.match(/(?<=Occupation\s+)(\w|\w )+/);
        listing['Occupation'] = found ? found[0].trim() : null;
    }

    return listing;
}

/**
 * Enqueues every listing linked from a search-result page, plus the next
 * result page when a pagination link is present.
 *
 * Requests are added one by one with `await`, so the returned promise only
 * settles once all of them are in the queue.
 *
 * @param {Object} context
 * @param {Function} context.$ Cheerio handle for the search-result page.
 * @param {*} context.postcode Postcode carried over to the enqueued requests.
 * @param {Object} context.requestQueue Queue the new requests are added to.
 * @returns {Promise<void>}
 */
async function enqueueListPage({ $, postcode, requestQueue }) {
    const listingHrefs = $('.listing-result:not(:first-child) header.desktop a')
        .map((index, el) => $(el).attr('href'))
        .get();

    for (const href of listingHrefs) {
        await requestQueue.addRequest({
            url: resolveUrl(href, SITE_ROOT_URL),
            userData: {
                label: 'ONE',
                postcode,
            },
        });
    }

    const nextHref = $('#paginationNextPageLink').first().attr('href');
    if (nextHref) {
        await requestQueue.addRequest({
            url: resolveUrl(nextHref, LIST_BASE_URL),
            userData: {
                label: 'LIST',
                postcode,
            },
        });
    }
}

/**
 * Actor entry point.
 *
 * 1. Opens the home page in a browser and runs one search per input postcode,
 *    recording the URL of each result page.
 * 2. Crawls those result pages (label "LIST") with Cheerio, enqueuing every
 *    listing (label "ONE") and the following result pages.
 * 3. Pushes one `{ error, data }` item per listing to the "spareroom" dataset.
 */
Apify.main(async () => {
    // NOTE(review): these are named storages; presumably they persist between
    // runs, so URLs handled by an earlier run may be skipped - confirm.
    const requestQueue = await Apify.openRequestQueue('spareroom');
    const requestQueueP = await Apify.openRequestQueue('spareroomP');
    const dataset = await Apify.openDataset('spareroom');

    const input = await Apify.getInput();
    const postcodes = input ? input.postcodes : undefined;
    if (!Array.isArray(postcodes)) {
        throw new Error('Input must contain a "postcodes" array.');
    }

    await requestQueueP.addRequest({
        url: 'https://www.spareroom.co.uk/',
        userData: {
            postcodes,
        },
    });

    const searchPages = [];
    const pptrCrawler = new Apify.PuppeteerCrawler({
        requestQueue: requestQueueP,
        handlePageFunction: async ({ request, page }) => {
            const selectorSearch = '#search_by_location_field';
            const selectorSearchSubmit = '#search_by_location_submit_button';
            const selectorLoaded = '.listing-result';

            for (const postcode of request.userData.postcodes) {
                // waitForSelector resolves at once when the field already exists.
                await page.waitForSelector(selectorSearch);

                // Defensive: empty the field in case the browser restored the
                // previous search text after goBack().
                await page.$eval(selectorSearch, (field) => { field.value = ''; });

                await page.focus(selectorSearch);
                await page.keyboard.type(String(postcode));
                await page.click(selectorSearchSubmit);

                await page.waitForSelector(selectorLoaded);
                searchPages.push({ url: page.url(), postcode });

                await page.goBack();
            }
        },
    });

    await pptrCrawler.run();

    for (const { url, postcode } of searchPages) {
        await requestQueue.addRequest({
            url,
            userData: {
                postcode,
                label: 'LIST',
            },
        });
    }

    const crawler = new Apify.CheerioCrawler({
        requestQueue,
        handlePageFunction: async ({ request, html, $ }) => {
            const { label, postcode } = request.userData;

            if (label === 'LIST') {
                await enqueueListPage({ $, postcode, requestQueue });
            } else if (label === 'ONE') {
                const output = {
                    "error": null,
                    "data": buildListing({ request, html, $ }),
                };

                console.log(output);

                await dataset.pushData(output);
            }
        },
    });

    await crawler.run();
});