Softwareadvicescrapereviews

Developed by Matt
Maintained by Community

Rating: 0.0 (0)
Pricing: Pay per usage
Total users: 19
Monthly users: 2
Runs succeeded: >99%
Last modified: 2 years ago

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16
# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./
# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, a quick build will be really fast
# for most source file changes.
COPY . ./
# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Input Schema",
    "description": "To point the crawler at another site, change the startUrls option.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "A static list of URLs to scrape. It is recommended to use only one start URL. Navigate to a company/product page and use its URL (e.g. https://www.softwareadvice.com/fleet-management/gps-insight-profile/) as the start URL.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.",
            "prefill": [{ "url": "https://www.softwareadvice.com/fleet-management/gps-insight-profile/" }],
            "editor": "requestListSources"
        },
        "maxRequestRetries": {
            "title": "Max page retries",
            "type": "integer",
            "description": "The maximum number of times the scraper will retry loading each web page, in case of a page load error or an exception thrown by the <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.",
            "minimum": 0,
            "prefill": 2,
            "default": 2
        }
    },
    "required": ["startUrls"]
}
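
For reference, a valid input matching this schema might look like the following sketch. The URL is the same example used in the prefill; any SoftwareAdvice company/product profile URL should work in its place.

// Example input for this actor (a sketch; startUrls is the only required field).
const exampleInput = {
    startUrls: [
        { url: 'https://www.softwareadvice.com/fleet-management/gps-insight-profile/' },
    ],
    maxRequestRetries: 2, // optional, defaults to 2
};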

main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get the input of the actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actors/development/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Prepare the input for the apify/web-scraper actor. This input is based on the
    // actor task that was used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "maxRequestRetries": input.maxRequestRetries,
        // The page function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        "pageFunction": async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug your code.
            // It only works with Run mode: DEVELOPMENT!
            // debugger;

            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, make sure the "Inject jQuery" option is enabled.
            const $ = context.jQuery;

            // Give the page a moment to render.
            await context.waitFor(1000);

            // If this is not the reviews page, check whether the product has a dedicated
            // full reviews page, and if so navigate there to get all the reviews.
            if (!context.request.url.endsWith('/reviews/')) {
                context.log.info(context.request.url);

                // Look for a "Reviews" link among the children of the wrapper element.
                const node = document.querySelector('.wrapper');
                let href = '';
                for (let i = 0; i < node.childNodes.length; i++) {
                    if (node.childNodes[i].childNodes[0].innerText === 'Reviews') {
                        try {
                            href = node.childNodes[i].childNodes[0].href;
                        } catch {}
                        break;
                    }
                }
                context.log.info(href);

                // If the link has an href attribute, a full reviews page exists,
                // so enqueue it and finish with this page.
                if (href !== '') {
                    await context.enqueueRequest({ url: href });
                    return;
                }
            }

            await context.waitFor(1500);

            // Return the inner text of the first element matching the selector,
            // or an empty string when no such element exists.
            const textOf = (element, selector) => {
                const found = element.querySelector(selector);
                return found ? found.innerText : '';
            };

            // Scrape the reviews, paginating with the "next" button. The total is
            // capped at 250 results so the run does not time out.
            const results = [];
            let button;
            do {
                // Re-query the review cards on every pass so the rows of the
                // currently displayed page are scraped, not a stale collection.
                const fullReviewData = $("[data-testid='reviews-container']");
                for (let i = 0; i < fullReviewData.length; i++) {
                    const review = fullReviewData[i];
                    results.push({
                        score: review.querySelectorAll("[data-testid='reviewers-rating'] .OverallStarRatingComponent .fullStar").length,
                        companySize: textOf(review, 'div.review-company > p > strong'),
                        industry: textOf(review, 'div.review-gdm-industry > p > strong'),
                        timeUsed: textOf(review, 'div.review-profile-time-used > p > strong'),
                        reviewSource: textOf(review, 'div.tooltip > p'),
                        date: textOf(review, '#reviews-list .review-date'),
                        title: textOf(review, "[data-testid='review-title']"),
                        summary: textOf(review, "[data-testid='review-summary']"),
                        pros: textOf(review, "[data-testid='review-pros']"),
                        cons: textOf(review, "[data-testid='review-cons']"),
                    });
                }
                // Advance to the next page if there is exactly one "next" button.
                button = document.getElementsByClassName('next');
                if (button.length !== 1) {
                    button = null;
                } else {
                    button[0].click();
                    await context.waitFor(1500);
                }
            } while (button !== null && results.length <= 250);

            // Return an object with the data extracted from the page.
            // It will be stored in the resulting dataset.
            return {
                results: results,
            };
        },
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]`,
        "proxyConfiguration": {
            "useApifyProxy": true,
            "apifyProxyCountry": "US"
        },
        "startUrls": input.startUrls,
        "runMode": "PRODUCTION",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now metamorph into the apify/web-scraper actor using the input created above.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
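
The pre/post-navigation hook strings above are passed through empty. Purely as an illustration (not part of the original actor), a filled-in preNavigationHooks value could adjust Puppeteer's navigation options through gotoOptions, for example:

// Hypothetical value for metamorphInput.preNavigationHooks. Web Scraper
// evaluates this string and calls each hook before every navigation.
metamorphInput.preNavigationHooks = `[
    async (crawlingContext, gotoOptions) => {
        // Give slow review pages more time to load before navigation fails
        // (the timeout value is an assumption, not taken from the actor).
        gotoOptions.timeout = 60000;
        gotoOptions.waitUntil = 'networkidle2';
    },
]`;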

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
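
To run the actor from another Node.js program, a minimal sketch using the apify-client package might look like the following. The actor identifier and the APIFY_TOKEN environment variable are placeholders; because main.js metamorphs into apify/web-scraper, the scraped reviews end up in the run's default dataset under the "results" key returned by the page function.

const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

(async () => {
    // 'matt/softwareadvicescrapereviews' is a placeholder actor identifier.
    const run = await client.actor('matt/softwareadvicescrapereviews').call({
        startUrls: [
            { url: 'https://www.softwareadvice.com/fleet-management/gps-insight-profile/' },
        ],
        maxRequestRetries: 2,
    });

    // The metamorphed web-scraper run stores its output in the default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.dir(items, { depth: null });
})();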