Software Advice Scrape Reviews

No credit card required

Softwareadvicescrapereviews

Softwareadvicescrapereviews

undrtkr984/softwareadvicescrapereviews

No credit card required

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Copy just package.json and package-lock.json first, since those are the
# only files that affect "npm install" in the next step — this lets Docker
# cache the dependency layer and speeds up rebuilds.
COPY package*.json ./

# Install NPM packages; skip optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree plus tool versions for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after "npm install", a rebuild is fast for most
# source-file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
  "title": "Input Schema",
  "description": "To update crawler to another site, you need to change startUrls options!",
  "type": "object",
  "schemaVersion": 1,
  "properties": {
    "startUrls": {
      "title": "Start URLs",
      "type": "array",
      "description": "A static list of URLs to scrape. It is recommended to only use one start url. Just navigate to a company/product page and grab the url like so (ex: https://www.softwareadvice.com/fleet-management/gps-insight-profile/) and use that as the start URL.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.",
      "prefill": [{ "url": "https://www.softwareadvice.com/fleet-management/gps-insight-profile/" }],
      "editor": "requestListSources"
    },
    "maxRequestRetries": {
      "title": "Max page retries",
      "type": "integer",
      "description": "The maximum number of times the scraper will retry to load each web page on error, in case of a page load error or an exception thrown by <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.",
      "minimum": 0,
      "prefill": 2,
      "default": 2
    }
  },
  "required": ["startUrls"]
}

main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actors/development/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Prepare the input for actor apify/web-scraper. This input is based on the
    // actor task that was used as the starting point.
    const metamorphInput = {
        breakpointLocation: 'NONE',
        browserLog: false,
        debugLog: false,
        downloadCss: true,
        downloadMedia: true,
        ignoreCorsAndCsp: false,
        ignoreSslErrors: false,
        injectJQuery: true,
        keepUrlFragments: false,
        maxRequestRetries: input.maxRequestRetries,
        // The function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        pageFunction: async function pageFunction(context) {
            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, make sure to enable the "Inject jQuery" option.
            const $ = context.jQuery;

            // Give the page a moment to render before inspecting the DOM.
            await context.waitFor(1000);

            // Element wrapping the page's tab links (the "Reviews" tab lives here).
            const node = document.querySelector('.wrapper');
            let href = '';

            // If this is not the reviews page, check whether a full reviews page exists.
            if (!context.request.url.endsWith('/reviews/')) {
                context.log.info(context.request.url);

                for (let i = 0; i < node.childNodes.length; i++) {
                    if (node.childNodes[i].childNodes[0].innerText === 'Reviews') {
                        try {
                            href = node.childNodes[i].childNodes[0].href;
                        } catch (error) {
                            // "Reviews" tab carries no link — keep scraping this page.
                        }
                        break;
                    }
                }
                context.log.info(href);
                // If the anchor had an href attribute, a dedicated reviews page exists;
                // enqueue it and let that request collect all the reviews instead.
                if (href !== '') {
                    await context.enqueueRequest({ url: href });
                    return;
                }
            }

            await context.waitFor(1500);

            // Scrape all reviews, paginating with the "next" button.
            const fullReviewData = $("[data-testid='reviews-container']");
            const results = [];
            let button;
            do {
                for (let i = 0; i < fullReviewData.length; i++) {
                    // Every field defaults to an empty string so one missing
                    // element does not abort the whole review.
                    let score = '';
                    let companySize = '';
                    let industry = '';
                    let timeUsed = '';
                    let reviewSource = '';
                    let date = '';
                    let title = '';
                    let summary = '';
                    let pros = '';
                    let cons = '';

                    try {
                        // Overall score = number of filled-in stars.
                        score = fullReviewData[i].querySelectorAll("[data-testid='reviewers-rating'] .OverallStarRatingComponent .fullStar").length;
                    } catch (error) {}
                    try {
                        companySize = fullReviewData[i].querySelector('div.review-company > p > strong').innerText;
                    } catch (error) {}
                    try {
                        industry = fullReviewData[i].querySelector('div.review-gdm-industry > p > strong').innerText;
                    } catch (error) {}
                    try {
                        timeUsed = fullReviewData[i].querySelector('div.review-profile-time-used > p > strong').innerText;
                    } catch (error) {}
                    try {
                        reviewSource = fullReviewData[i].querySelector('div.tooltip > p').innerText;
                    } catch (error) {}
                    try {
                        date = fullReviewData[i].querySelector('#reviews-list .review-date').innerText;
                    } catch (error) {}
                    try {
                        title = fullReviewData[i].querySelector("[data-testid='review-title']").innerText;
                    } catch (error) {}
                    try {
                        summary = fullReviewData[i].querySelector("[data-testid='review-summary']").innerText;
                    } catch (error) {}
                    try {
                        pros = fullReviewData[i].querySelector("[data-testid='review-pros']").innerText;
                    } catch (error) {}
                    try {
                        cons = fullReviewData[i].querySelector("[data-testid='review-cons']").innerText;
                    } catch (error) {}

                    results.push({ score, companySize, industry, timeUsed, reviewSource, date, title, summary, pros, cons });
                }

                // Advance to the next page of reviews when exactly one "next" button exists.
                button = document.getElementsByClassName('next');
                if (button == null || button.length !== 1) {
                    button = null;
                } else {
                    button[0].click();
                    await context.waitFor(1500);
                }
                // Cap the number of scraped reviews so the run does not time out.
            } while (button != null && results.length <= 250);

            // Return an object with the data extracted from the page.
            // It will be stored to the resulting dataset.
            return { results };
        },
        postNavigationHooks: `// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]`,
        preNavigationHooks: `// We need to return array of (possibly async) functions here.
        // The functions accept two arguments: the "crawlingContext" object
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]`,
        proxyConfiguration: {
            useApifyProxy: true,
            apifyProxyCountry: 'US',
        },
        startUrls: input.startUrls,
        runMode: 'PRODUCTION',
        useChrome: false,
        waitUntil: ['networkidle2'],
    };

    // Now metamorph into actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});

package.json

{
  "name": "my-actor",
  "version": "0.0.1",
  "dependencies": {
    "apify": "^2.2.2"
  },
  "scripts": {
    "start": "node main.js"
  },
  "author": "Me!"
}
Developer
Maintained by Community
Actor stats
  • 8 users
  • 351 runs
  • Modified 10 months ago
Categories

You might also like these Actors