Spareroom
Deprecated
Pricing
Pay per usage
Go to Store
Spareroom
Deprecated
Scrape SpareRoom listings by postcode.
0.0 (0)
Pricing
Pay per usage
1
Total users
40
Monthly users
2
Last modified
3 years ago
Dockerfile
# This is a template for a Dockerfile used to run acts in Actor system.
# The base image name below is set during the act build, based on user settings.
# IMPORTANT: The base image must set a correct working directory, such as /usr/src/app or /home/user
FROM apify/actor-node-chrome:v0.21.10

# Second, copy just package.json and package-lock.json since it should be
# the only file that affects "npm install" in the next step, to speed up the build
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Copy source code to container
# Do this in the last step, to have fast build if only the source code changed
COPY . ./

# NOTE: The CMD is already defined by the base image.
# Uncomment this for local node inspector debugging:
# CMD [ "node", "--inspect=0.0.0.0:9229", "main.js" ]
package.json
{ "name": "apify-project", "version": "0.0.1", "description": "", "author": "It's not you it's me", "license": "ISC", "dependencies": { "apify": "0.21.10", "moment": "latest", "lodash": "latest" }, "scripts": { "start": "node main.js" }}
main.js
// Apify SDK: supplies the request queues, dataset and crawlers used by this actor.
const Apify = require('apify');
// Disabled imports kept from the original source; neither module is used in this file.
//var moment = require('moment');
//var _ = require('lodash');

/**
 * Blank record describing one scraped listing.
 *
 * The scraper deep-clones this object for every listing page and then fills
 * the fields in, so the key order here is the key order of every record
 * pushed to the dataset. Fields that cannot be found on a page stay `null`.
 */
const templateResult = {
    // General listing data.
    'Title': '',
    'All Images': [],
    'Description': '',
    'Postcode': null,
    'Rent': null,

    // Entries read from the listing's feature lists.
    'Furnishings': null,
    'Parking': null,
    'Garage': null,
    'Garden/terrace': null,
    'Balcony/Patio': null,
    'Disabled access': null,
    'Living room': null,
    'Broadband': null,
    '# housemates': null, // ??? (marked as uncertain in the original source)
    'Total # rooms': null, // ??? (marked as uncertain in the original source)
    'Couples OK?': null,
    'Smoking OK?': null,
    'Pets OK?': null,
    'Occupation': null,
    'Housing Benefit': null,
    'References?': null,
    'Max age': null,
    'Gender': null,
};

32
// Base used to resolve listing links found on search-result pages.
const SPAREROOM_ORIGIN = 'https://www.spareroom.co.uk/';
// Base used to resolve the "next page" link of a search-result page.
const SPAREROOM_SEARCH_BASE = 'https://www.spareroom.co.uk/flatshare/';

// Pairs of [key in the result record, label text preceding `</dt>` in the page HTML].
const FEATURE_FIELDS = [
    ['Furnishings', 'Furnishings'],
    ['Parking', 'Parking'],
    ['Garage', 'Garage'],
    ['Garden/terrace', 'Garden/terrace'],
    ['Balcony/Patio', 'Balcony/Patio'],
    ['Disabled access', 'Disabled access'],
    ['Living room', 'Living room'],
    ['Broadband', 'Broadband included'],
    ['# housemates', '# housemates'],
    ['Total # rooms', 'Total # rooms'],
    ['Couples OK?', 'Couples OK?'],
    ['Smoking OK?', 'Smoking OK?'],
    ['Pets OK?', 'Pets OK?'],
    ['Occupation', 'Occupation'],
    ['Housing Benefit', 'Housing Benefit'],
    ['References?', 'References?'],
    ['Max age', 'Max age'],
    ['Gender', 'Gender'],
];

/** Escapes regular-expression metacharacters so `text` is matched literally. */
const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// One pre-compiled pattern per feature: whatever follows the opening
// `<dd class="feature-list__value">` on the same line, up to the last `<` of that line.
const FEATURE_PATTERNS = FEATURE_FIELDS.map(([key, label]) => [
    key,
    new RegExp(`(?<=${escapeRegExp(label)}</dt>\\s+<dd class="feature-list__value">).+<`),
]);

/**
 * Reads one feature value out of the raw page HTML.
 *
 * @param {string} html - Raw HTML of the listing page.
 * @param {RegExp} pattern - One of the patterns in FEATURE_PATTERNS.
 * @returns {string|null} 'Yes' or 'No' when the matched fragment contains that
 *   word, otherwise the fragment with markup removed, or null when the feature
 *   is absent or empty.
 */
function extractFeature(html, pattern) {
    const found = html.match(pattern);
    if (!found) return null;

    const fragment = found[0];
    // The check runs on the raw fragment (markup included) and is a plain
    // substring test, so any value containing "Yes"/"No" collapses to that word.
    if (fragment.includes('Yes')) return 'Yes';
    if (fragment.includes('No')) return 'No';

    // The pattern always ends on a '<'; drop it, then drop any complete tags.
    const text = fragment.slice(0, -1).replace(/<[^>]*>/g, '').trim();
    return text || null;
}

/**
 * Parses the price and size texts of one room row.
 *
 * @param {string} priceText - Text of the price element; the code expects digits
 *   followed by a space and a period word (something like "500 pcm" - TODO confirm).
 * @param {string} sizeText - Text of the room-size element.
 * @returns {{Price: (string|null), Size: (string|null), Per: (string|null)}}
 *   Each part is null when it cannot be found, instead of throwing.
 */
function parseRoomRent(priceText, sizeText) {
    // Remove thousands separators first so "1,200" is read as "1200", not "1".
    const amount = priceText.replace(/,/g, '').match(/\d+/);
    const period = priceText.match(/(?<=\d )[\w]+/);
    const size = sizeText.match(/\w+/);
    return {
        'Price': amount ? amount[0] : null,
        'Size': size ? size[0] : null,
        'Per': period ? period[0] : null,
    };
}

/**
 * Builds the result record for a single listing page.
 *
 * @param {Function} $ - Cheerio handle loaded with the listing page.
 * @param {string} html - Raw HTML of the same page.
 * @param {string} postcode - Postcode whose search led to this listing.
 * @returns {Object} A filled-in deep copy of `templateResult`.
 */
function buildListingResult($, html, postcode) {
    // Deep copy, so the shared template (and its nested array) is never mutated.
    const result = JSON.parse(JSON.stringify(templateResult));

    result['Title'] = $('#listing_heading h1').first().text().trim();
    // cheerio's map() skips undefined, so anchors without an href are left out.
    result['All Images'] = $('a.photoswipe_me.img').map((i, el) => $(el).attr('href')).get();
    result['Description'] = $('.detaildesc').first().text().trim();
    result['Postcode'] = postcode;

    result['Rent'] = [];
    $('.room-list__room').each((i, el) => {
        const room = $(el);
        // Rows containing an <s> element are skipped (presumably struck-through,
        // i.e. no longer offered - TODO confirm against the site).
        if (room.find('s').length > 0) return;

        const priceText = room.find('.room-list__price').first().text().trim();
        const sizeText = room.find('.room-list__room small').first().text().trim();
        result['Rent'].push(parseRoomRent(priceText, sizeText));
    });

    for (const [key, pattern] of FEATURE_PATTERNS) {
        result[key] = extractFeature(html, pattern);
    }

    // Fallback: take the word following "Occupation" in the household-preferences section.
    if (!result['Occupation']) {
        const preferences = $('.feature.feature--household-preferences').first().text();
        const occupation = preferences.match(/(?<=Occupation\s+)(\w|\w )+/);
        result['Occupation'] = occupation ? occupation[0] : null;
    }

    return result;
}

/**
 * Actor entry point.
 *
 * Phase 1 (Puppeteer): opens the SpareRoom home page, submits the search form
 * once per input postcode and records the URL of each result page.
 * Phase 2 (Cheerio): walks the result pages (label 'LIST'), enqueues every
 * listing (label 'ONE') plus the next result page, and pushes one
 * `{ error, data }` record per listing to the 'spareroom' dataset.
 *
 * Input: `{ postcodes: string[] }`.
 *
 * NOTE(review): the queues and the dataset are opened by name; named storages
 * presumably persist between runs, in which case an already-handled start URL
 * would be skipped on a re-run - confirm against the platform documentation.
 */
Apify.main(async () => {
    const requestQueue = await Apify.openRequestQueue('spareroom');
    const requestQueueP = await Apify.openRequestQueue('spareroomP');
    const dataset = await Apify.openDataset('spareroom');

    const input = await Apify.getInput();
    if (!input || !Array.isArray(input.postcodes)) {
        throw new Error('Actor input must be an object with a "postcodes" array.');
    }

    await requestQueueP.addRequest({
        url: 'https://www.spareroom.co.uk/',
        userData: {
            postcodes: input.postcodes,
        },
    });

    // Filled by the Puppeteer phase: one { url, postcode } per search performed.
    const searchResultPages = [];

    const pptrCrawler = new Apify.PuppeteerCrawler({
        requestQueue: requestQueueP,
        handlePageFunction: async ({ request, page }) => {
            const selectorSearch = '#search_by_location_field';
            const selectorSearchSubmit = '#search_by_location_submit_button';
            const selectorLoaded = '.listing-result';

            for (const postcode of request.userData.postcodes) {
                // page.$() returns a Promise and can never be compared with null,
                // so wait unconditionally; this resolves at once if the field exists.
                await page.waitForSelector(selectorSearch);
                // Guard against text left in the field from the previous iteration
                // (the browser may restore form values after goBack()).
                await page.$eval(selectorSearch, (field) => { field.value = ''; });

                await page.focus(selectorSearch);
                await page.keyboard.type(String(postcode));
                await page.click(selectorSearchSubmit);

                await page.waitForSelector(selectorLoaded);
                searchResultPages.push({ url: page.url(), postcode });

                await page.goBack();
            }
        },
    });

    await pptrCrawler.run();

    for (const { url, postcode } of searchResultPages) {
        await requestQueue.addRequest({
            url,
            userData: {
                postcode,
                label: 'LIST',
            },
        });
    }

    const crawler = new Apify.CheerioCrawler({
        requestQueue,
        handlePageFunction: async ({ request, html, $ }) => {
            const { label, postcode } = request.userData;

            if (label === 'LIST') {
                // Collect the hrefs first, then await each addRequest in turn:
                // an async callback passed to .each() would leave the promises floating.
                const listingHrefs = $('.listing-result:not(:first-child) header.desktop a')
                    .map((i, el) => $(el).attr('href'))
                    .get();

                for (const href of listingHrefs) {
                    await requestQueue.addRequest({
                        url: new URL(href, SPAREROOM_ORIGIN).href,
                        userData: {
                            label: 'ONE',
                            postcode,
                        },
                    });
                }

                const nextHref = $('#paginationNextPageLink').first().attr('href');
                if (nextHref) {
                    await requestQueue.addRequest({
                        url: new URL(nextHref, SPAREROOM_SEARCH_BASE).href,
                        userData: {
                            label: 'LIST',
                            postcode,
                        },
                    });
                }
            } else if (label === 'ONE') {
                const output = {
                    'error': null,
                    'data': buildListingResult($, html, postcode),
                };

                console.log(output);

                await dataset.pushData(output);
            }
        },
    });

    await crawler.run();
});