Softwareadvicescrapereviews

Under maintenance

Rating: 0.0 (0 reviews)
Pricing: Pay per usage
Total users: 19
Monthly users: 1
Runs succeeded: >99%
Last modified: 2 years ago
Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && (npm list || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start
{ "title": "Input Schema", "description": "To update crawler to another site, you need to change startUrls options!", "type": "object", "schemaVersion": 1, "properties": { "startUrls": { "title": "Start URLs", "type": "array", "description": "A static list of URLs to scrape. It is recommended to only use one start url. Just navigate to a company/product page and grab the url like so (ex: https://www.softwareadvice.com/fleet-management/gps-insight-profile/) and use that as the start URL.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.", "prefill": [{ "url": "https://www.softwareadvice.com/fleet-management/gps-insight-profile/" }], "editor": "requestListSources" }, "maxRequestRetries": { "title": "Max page retries", "type": "integer", "description": "The maximum number of times the scraper will retry to load each web page on error, in case of a page load error or an exception thrown by <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.", "minimum": 0, "prefill": 2, "default": 2 } }, "required": ["startUrls"]}
main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get the input of the actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actors/development/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Prepare the input for the apify/web-scraper actor. This input is based on
    // the actor task that was used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "maxRequestRetries": input.maxRequestRetries,
        // The page function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        "pageFunction": async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug
            // your code. It works only with Run mode: DEVELOPMENT!
            // debugger;

            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, make sure the "Inject jQuery" option is enabled.
            const $ = context.jQuery;

            // Give the page a moment to render before touching the DOM.
            await context.waitFor(1000);

            // Look through the navigation links inside the wrapper element.
            const node = document.querySelector('.wrapper');
            let href = '';

            // If this is not the reviews page, check whether the product has a
            // dedicated full-reviews page.
            if (!context.request.url.endsWith('/reviews/')) {
                context.log.info(context.request.url);

                for (let i = 0; i < node.childNodes.length; i++) {
                    const link = node.childNodes[i].childNodes[0];
                    if (link && link.innerText === 'Reviews') {
                        try {
                            href = link.href;
                        } catch {}
                        break;
                    }
                }
                context.log.info(href);

                // If the <a> tag had an href attribute, there is a full reviews
                // page, so navigate there to get all the reviews.
                if (href !== '') {
                    await context.enqueueRequest({ url: href });
                    return;
                }
            }

            await context.waitFor(1500);

            // Scrape all the reviews, paginating with the "next" button.
            const results = [];
            let button;
            do {
                // Re-query the review cards on every pass so that freshly
                // rendered reviews are picked up after each pagination click.
                const fullReviewData = $("[data-testid='reviews-container']");

                for (let i = 0; i < fullReviewData.length; i++) {
                    let score = '';
                    let companySize = '';
                    let industry = '';
                    let timeUsed = '';
                    let reviewSource = '';
                    let date = '';
                    let title = '';
                    let summary = '';
                    let pros = '';
                    let cons = '';

                    try {
                        score = fullReviewData[i].querySelectorAll("[data-testid='reviewers-rating'] .OverallStarRatingComponent .fullStar").length;
                    } catch (error) {}
                    try {
                        companySize = fullReviewData[i].querySelector('div.review-company > p > strong').innerText;
                    } catch (error) {}
                    try {
                        industry = fullReviewData[i].querySelector('div.review-gdm-industry > p > strong').innerText;
                    } catch (error) {}
                    try {
                        timeUsed = fullReviewData[i].querySelector('div.review-profile-time-used > p > strong').innerText;
                    } catch (error) {}
                    try {
                        reviewSource = fullReviewData[i].querySelector('div.tooltip > p').innerText;
                    } catch (error) {}
                    try {
                        date = fullReviewData[i].querySelector('#reviews-list .review-date').innerText;
                    } catch (error) {}
                    try {
                        title = fullReviewData[i].querySelector("[data-testid='review-title']").innerText;
                    } catch (error) {}
                    try {
                        summary = fullReviewData[i].querySelector("[data-testid='review-summary']").innerText;
                    } catch (error) {}
                    try {
                        pros = fullReviewData[i].querySelector("[data-testid='review-pros']").innerText;
                    } catch (error) {}
                    try {
                        cons = fullReviewData[i].querySelector("[data-testid='review-cons']").innerText;
                    } catch (error) {}

                    results.push({ score, companySize, industry, timeUsed, reviewSource, date, title, summary, pros, cons });
                }

                button = document.getElementsByClassName('next');
                if (button == null || button.length !== 1) {
                    button = null;
                } else {
                    button[0].click();
                    await context.waitFor(1500);
                }
            } while (button != null && results.length <= 250); // cap the result count so the run does not time out

            // Return an object with the data extracted from the page.
            // It will be stored in the resulting dataset.
            return {
                results: results,
            };
        },
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
        // The functions accept two arguments: the "crawlingContext" object
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]`,
        "proxyConfiguration": {
            "useApifyProxy": true,
            "apifyProxyCountry": "US"
        },
        "startUrls": input.startUrls,
        "runMode": "PRODUCTION",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now metamorph into the apify/web-scraper actor using the prepared input.
    // Metamorph replaces this actor's process with web-scraper within the same
    // run, so web-scraper's dataset becomes this actor's output.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
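Because the actor metamorphs into apify/web-scraper, the scraped reviews end up in the run's default dataset. Below is a minimal sketch of calling the actor from your own code with the apify-client NPM package; the API token and actor ID are placeholders (this listing does not state the actor's ID), so substitute your own values.

// Minimal sketch: run the actor via the Apify API and read the scraped
// reviews from the run's default dataset. Assumes the apify-client NPM
// package; the token and actor ID below are placeholders.
const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'MY_APIFY_TOKEN' });

async function main() {
    // Start the actor and wait for the run to finish.
    const run = await client
        .actor('username/softwareadvicescrapereviews') // hypothetical actor ID
        .call({
            startUrls: [{ url: 'https://www.softwareadvice.com/fleet-management/gps-insight-profile/' }],
            maxRequestRetries: 2,
        });

    // Each dataset item holds a `results` array of review objects:
    // { score, companySize, industry, timeUsed, reviewSource, date, title, summary, pros, cons }.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    for (const item of items) {
        console.dir(item.results, { depth: null });
    }
}

main().catch(console.error);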
{ "name": "my-actor", "version": "0.0.1", "dependencies": { "apify": "^2.2.2" }, "scripts": { "start": "node main.js" }, "author": "Me!"}