Spareroom
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Spareroom
zyberg/spareroom
Scrape Spare Room listings by postcodes
Dockerfile
1# This is a template for a Dockerfile used to run acts in Actor system.
2# The base image name below is set during the act build, based on user settings.
3# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
4FROM apify/actor-node-chrome:v0.21.10
5
6# First, copy just package.json and package-lock.json, since they should be
7# the only files that affect "npm install" in the next step; this speeds up the build
8COPY package*.json ./
9
10# Install NPM packages, skip optional and development dependencies to
11# keep the image small. Avoid logging too much and print the dependency
12# tree for debugging
13RUN npm --quiet set progress=false \
14 && npm install --only=prod --no-optional \
15 && echo "Installed NPM packages:" \
16 && (npm list --all || true) \
17 && echo "Node.js version:" \
18 && node --version \
19 && echo "NPM version:" \
20 && npm --version
21
22# Copy source code to container
23# Do this in the last step, to have fast build if only the source code changed
24COPY . ./
25
26# NOTE: The CMD is already defined by the base image.
27# Uncomment this for local node inspector debugging:
28# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
1{
2 "name": "apify-project",
3 "version": "0.0.1",
4 "description": "",
5 "author": "It's not you it's me",
6 "license": "ISC",
7 "dependencies": {
8 "apify": "0.21.10",
9 "moment": "latest",
10 "lodash": "latest"
11 },
12 "scripts": {
13 "start": "node main.js"
14 }
15}
main.js
1const Apify = require('apify');
2//var moment = require('moment');
3//var _ = require('lodash');
4
/**
 * Blank listing record.
 *
 * Callers deep-clone this object and then fill in the fields, so it defines
 * both the set of output keys and their order. Text fields start as '',
 * the image list as an empty array and everything else as null.
 */
const templateResult = {
    Title: '',
    'All Images': [],
    Description: '',
    Postcode: null,
    Rent: null,
};

// Feature fields all start out unknown (null); they are appended in output order.
for (const featureKey of [
    'Furnishings',
    'Parking',
    'Garage',
    'Garden/terrace',
    'Balcony/Patio',
    'Disabled access',
    'Living room',
    'Broadband',
    '# housemates',
    'Total # rooms',
    'Couples OK?',
    'Smoking OK?',
    'Pets OK?',
    'Occupation',
    'Housing Benefit',
    'References?',
    'Max age',
    'Gender',
]) {
    templateResult[featureKey] = null;
}
31
32
33Apify.main(async () => {
34 const requestQueue = await Apify.openRequestQueue('spareroom');
35 const requestQueueP = await Apify.openRequestQueue('spareroomP');
36 const dataset = await Apify.openDataset('spareroom');
37
38 const input = await Apify.getInput();
39
40 await requestQueueP.addRequest({
41 url: 'https://www.spareroom.co.uk/',
42 userData: {
43 postcodes: input.postcodes,
44 }
45 });
46
47 let urlsLists = [ ];
// Browser crawler that turns each input postcode into the URL of its
// search-results page by driving the home-page search form.
const pptrCrawler = new Apify.PuppeteerCrawler({
    requestQueue: requestQueueP,

    /**
     * Handles the single home-page request.
     *
     * For every postcode in `request.userData.postcodes` it types the
     * postcode into the location field, submits the search, records the
     * resulting page URL in `urlsLists` and navigates back for the next one.
     *
     * @param {object} context
     * @param {object} context.request - request whose userData carries `postcodes`
     * @param {object} context.page - Puppeteer page for the request
     */
    handlePageFunction: async ({ request, page }) => {
        const selectorSearch = '#search_by_location_field';
        const selectorSearchSubmit = '#search_by_location_submit_button';
        const selectorLoaded = '.listing-result';

        for (const postcode of request.userData.postcodes) {
            // page.$() returns a Promise, so the previous
            // `page.$(selector) === null` check was never true and this wait
            // never ran. Wait unconditionally: it resolves immediately when
            // the field is already present, and otherwise blocks until the
            // page (e.g. after goBack) has rendered it.
            await page.waitForSelector(selectorSearch);

            await page.focus(selectorSearch);
            await page.keyboard.type(postcode);
            await page.click(selectorSearchSubmit);

            // The results page is considered loaded once a listing is present.
            await page.waitForSelector(selectorLoaded);
            urlsLists.push({ url: page.url(), postcode });

            await page.goBack();
        }
    },
});
70
71 await pptrCrawler.run();
72
73 for (let url of urlsLists)
74 await requestQueue.addRequest({
75 url: url.url,
76 userData: {
77 postcode: url.postcode,
78 label: "LIST"
79 }
80 });
81
82
83 const crawler = new Apify.CheerioCrawler({
84 requestQueue,
85 handlePageFunction: async ({ request, html, $ }) => {
86 const title = $("title").text();
87
/**
 * Builds one listing record from the page currently being handled.
 *
 * Reads `$`, `html` and `request` from the enclosing page handler and clones
 * `templateResult`, so the returned object always has the template's keys in
 * the template's order.
 *
 * Differences from the previous implementation (all bug fixes):
 *  - rooms with missing/unparseable price or size no longer throw; the
 *    affected fields are null instead;
 *  - prices with thousands separators ("£1,200") yield "1200", not "1";
 *  - every feature value has its markup stripped (previously
 *    'Housing Benefit' and 'References?' kept a trailing '<', and <span>
 *    wrappers were only fully cleaned for 'Living room');
 *  - feature values spanning several lines inside the <dd> are now found;
 *  - the Occupation fallback stores a string, not the regex match array.
 *
 * @returns {object} the populated listing record
 */
const generateResult = () => {
    // [result key, label text of the page's <dt>]. Only 'Broadband' differs.
    const featureLabels = [
        ['Furnishings', 'Furnishings'],
        ['Parking', 'Parking'],
        ['Garage', 'Garage'],
        ['Garden/terrace', 'Garden/terrace'],
        ['Balcony/Patio', 'Balcony/Patio'],
        ['Disabled access', 'Disabled access'],
        ['Living room', 'Living room'],
        ['Broadband', 'Broadband included'],
        ['# housemates', '# housemates'],
        ['Total # rooms', 'Total # rooms'],
        ['Couples OK?', 'Couples OK?'],
        ['Smoking OK?', 'Smoking OK?'],
        ['Pets OK?', 'Pets OK?'],
        ['Occupation', 'Occupation'],
        ['Housing Benefit', 'Housing Benefit'],
        ['References?', 'References?'],
        ['Max age', 'Max age'],
        ['Gender', 'Gender'],
    ];

    // Escapes regex metacharacters so labels such as "Couples OK?" match literally.
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // Returns the first match of `pattern` in `text`, or null when there is none.
    const firstMatch = (text, pattern) => {
        const found = text.match(pattern);
        return found ? found[0] : null;
    };

    // Returns the cleaned text of the <dd> that follows `<label></dt>`, or null.
    const readFeature = (label) => {
        const pattern = new RegExp(
            `${escapeRegExp(label)}</dt>\\s*<dd class="feature-list__value">([\\s\\S]*?)</dd>`
        );
        const found = pattern.exec(html);
        if (!found) return null;

        // Drop any markup (e.g. <span> wrappers) and surrounding whitespace.
        const text = found[1].replace(/<[^>]*>/g, '').trim();
        if (text === '') return null;

        // Same normalisation as before: any value containing "Yes" / "No"
        // collapses to exactly that word.
        // NOTE(review): this is a substring test, so a value such as
        // "Not specified" also becomes "No" - confirm that is intended.
        if (text.includes('Yes')) return 'Yes';
        if (text.includes('No')) return 'No';
        return text;
    };

    // Deep copy so the shared template (and its image array) is never mutated.
    const result = JSON.parse(JSON.stringify(templateResult));

    result['Title'] = $('#listing_heading h1').first().text().trim();

    $('a.photoswipe_me.img').each((index, el) => {
        result['All Images'].push($(el).attr('href'));
    });

    result['Description'] = $('.detaildesc').first().text().trim();
    result['Postcode'] = request.userData.postcode;

    const rents = [];
    $('.room-list__room').each((index, el) => {
        const room = $(el);

        // Rooms containing a struck-through (<s>) element are skipped.
        if (room.find('s').length !== 0) return;

        const strPrice = room.find('.room-list__price').first().text().trim();
        const strSize = room.find('.room-list__room small').first().text().trim();

        // Keep digits and thousands separators together, then drop the commas.
        const amount = firstMatch(strPrice, /\d[\d,]*/);

        rents.push({
            'Price': amount === null ? null : amount.replace(/,/g, ''),
            'Size': firstMatch(strSize, /\w+/),
            // The word following "<digit><space>", e.g. "pcm" in "£650 pcm".
            'Per': firstMatch(strPrice, /(?<=\d )[\w]+/),
        });
    });
    result['Rent'] = rents;

    for (const [key, label] of featureLabels) {
        result[key] = readFeature(label);
    }

    // Fall back to the household-preferences text when no Occupation <dd> exists.
    if (!result['Occupation']) {
        const preferencesText = $('.feature.feature--household-preferences').first().text();
        const occupation = firstMatch(preferencesText, /(?<=Occupation\s+)(\w|\w )+/);
        result['Occupation'] = occupation === null ? null : occupation.trim();
    }

    return result;
};
209
// The eagerly built `output` record that used to precede `run` was removed:
// it was never read, and it ran the detail-page extraction on every page,
// including search-result (LIST) pages.

// Page handlers, selected by the request's `userData.label`.
const run = {
    /**
     * Handles a search-results page: queues every listing it links to
     * (label 'ONE') and, when present, the next results page (label 'LIST').
     * Resolves only after all requests have been added to the queue.
     */
    scrapeList: async () => {
        // Collect the links synchronously first. Awaiting inside `.each()`
        // has no effect (the callback's promise is discarded), which let
        // this handler finish before the listings were actually queued.
        const listingHrefs = [];
        $('.listing-result:not(:first-child) header.desktop a').each((index, el) => {
            const href = $(el).attr('href');
            // Anchors without an href would otherwise produce ".../undefined".
            if (href) listingHrefs.push(href);
        });

        for (const href of listingHrefs) {
            await requestQueue.addRequest({
                url: 'https://www.spareroom.co.uk/' + href,
                userData: {
                    label: 'ONE',
                    postcode: request.userData.postcode,
                },
            });
        }

        const nextPageLink = $('#paginationNextPageLink');
        if (nextPageLink.length !== 0) {
            await requestQueue.addRequest({
                url: 'https://www.spareroom.co.uk/flatshare/' + nextPageLink.first().attr('href'),
                userData: {
                    label: 'LIST',
                    postcode: request.userData.postcode,
                },
            });
        }
    },

    /**
     * Handles a single listing page: extracts the record, logs it and
     * pushes it to the dataset wrapped as `{ error, data }`.
     */
    scrapeOne: async () => {
        const output = {
            "error": null,
            "data": generateResult(),
        };

        console.log(output);

        await dataset.pushData(output);
    },
};
249
250 if (request.userData.label === 'LIST') await run.scrapeList();
251 else if (request.userData.label === 'ONE') await run.scrapeOne();
252
253 }
254 });
255
256 await crawler.run();
257});
Developer
Maintained by Community
Categories