Softwareadvicescrapereviews

undrtkr984/softwareadvicescrapereviews

Dockerfile

# First, specify the base Docker image. You can read more about
# the available images at https://sdk.apify.com/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:16

# Second, copy just package.json and package-lock.json since those are the only
# files that affect "npm install" in the next step, to speed up the build.
COPY package*.json ./

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
 && npm install --only=prod --no-optional \
 && echo "Installed NPM packages:" \
 && (npm list || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, subsequent builds will be fast
# for most source file changes.
COPY . ./

# Optionally, specify how to launch the source code of your actor.
# By default, Apify's base Docker images define the CMD instruction
# that runs the Node.js source code using the command specified
# in the "scripts.start" section of the package.json file.
# In short, the instruction looks something like this:
#
# CMD npm start

INPUT_SCHEMA.json

{
    "title": "Input Schema",
    "description": "To point the crawler at another product, change the startUrls option.",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "A static list of URLs to scrape. It is recommended to use only one start URL: navigate to a company/product page and use its URL (e.g. https://www.softwareadvice.com/fleet-management/gps-insight-profile/) as the start URL.<br><br>For details, see <a href='https://apify.com/apify/web-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.",
            "prefill": [{ "url": "https://www.softwareadvice.com/fleet-management/gps-insight-profile/" }],
            "editor": "requestListSources"
        },
        "maxRequestRetries": {
            "title": "Max page retries",
            "type": "integer",
            "description": "The maximum number of times the scraper will retry each page after a page load error or an exception thrown by the <b>Page function</b>.<br><br>If set to <code>0</code>, the page will be considered failed right after the first error.",
            "minimum": 0,
            "prefill": 2,
            "default": 2
        }
    },
    "required": ["startUrls"]
}
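
For reference, a minimal actor input that conforms to this schema might look like the following (using the prefill URL from above):

{
    "startUrls": [
        { "url": "https://www.softwareadvice.com/fleet-management/gps-insight-profile/" }
    ],
    "maxRequestRetries": 2
}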

main.js

// This is the main Node.js source code file of your actor.
// It is referenced from the "scripts" section of the package.json file.

const Apify = require('apify');

Apify.main(async () => {
    // Get input of the actor. Input fields can be modified in the INPUT_SCHEMA.json file.
    // For more information, see https://docs.apify.com/actors/development/input-schema
    const input = await Apify.getInput();
    console.log('Input:');
    console.dir(input);

    // Here you can prepare your input for the actor apify/web-scraper. This input is based
    // on the actor task you used as the starting point.
    const metamorphInput = {
        "breakpointLocation": "NONE",
        "browserLog": false,
        "debugLog": false,
        "downloadCss": true,
        "downloadMedia": true,
        "ignoreCorsAndCsp": false,
        "ignoreSslErrors": false,
        "injectJQuery": true,
        "keepUrlFragments": false,
        "maxRequestRetries": input.maxRequestRetries,
        "pageFunction": // The function accepts a single argument: the "context" object.
        // For a complete list of its properties and functions,
        // see https://apify.com/apify/web-scraper#page-function
        async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
            // debugger;

            // jQuery is handy for finding DOM elements and extracting data from them.
            // To use it, make sure to enable the "Inject jQuery" option.
            const $ = context.jQuery;

            // Give the page a moment to render before touching the DOM.
            await context.waitFor(1000);

            // Get the navigation elements contained in the wrapper class.
            var node = document.querySelector('.wrapper');
            var href = '';
            // If this is not the reviews page, check whether the product has a full reviews page.
            if (!context.request.url.endsWith('/reviews/')) {
                context.log.info(context.request.url);

                for (let i = 0; i < node.childNodes.length; i++) {
                    if (node.childNodes[i].childNodes[0].innerText == 'Reviews') {
                        try {
                            href = node.childNodes[i].childNodes[0].href;
                        } catch (error) {}
                        break;
                    }
                }
                context.log.info(href);
                // If the anchor had an href attribute, the product has a full reviews page,
                // so navigate there to get all the reviews.
                if (href != '') {
                    await context.enqueueRequest({ url: href });
                    return;
                }
            }

            await context.waitFor(1500);

            var results = [];
            var button;
            do {
                // Scrape all the reviews currently rendered. Re-select the containers on
                // every pass so that reviews loaded by pagination are also captured.
                var fullReviewData = $("[data-testid='reviews-container']");
                for (let i = 0; i < fullReviewData.length; i++) {
                    var score = '';
                    var companySize = '';
                    var industry = '';
                    var timeUsed = '';
                    var reviewSource = '';
                    var date = '';
                    var title = '';
                    var summary = '';
                    var pros = '';
                    var cons = '';

                    // Each field is read inside a try/catch because some reviews omit some fields.
                    try {
                        score = fullReviewData[i].querySelectorAll("[data-testid='reviewers-rating'] .OverallStarRatingComponent .fullStar").length;
                    } catch (error) {}
                    try {
                        companySize = fullReviewData[i].querySelector('div.review-company > p > strong').innerText;
                    } catch (error) {}
                    try {
                        industry = fullReviewData[i].querySelector('div.review-gdm-industry > p > strong').innerText;
                    } catch (error) {}
                    try {
                        timeUsed = fullReviewData[i].querySelector('div.review-profile-time-used > p > strong').innerText;
                    } catch (error) {}
                    try {
                        reviewSource = fullReviewData[i].querySelector('div.tooltip > p').innerText;
                    } catch (error) {}
                    try {
                        date = fullReviewData[i].querySelector('#reviews-list .review-date').innerText;
                    } catch (error) {}
                    try {
                        title = fullReviewData[i].querySelector("[data-testid='review-title']").innerText;
                    } catch (error) {}
                    try {
                        summary = fullReviewData[i].querySelector("[data-testid='review-summary']").innerText;
                    } catch (error) {}
                    try {
                        pros = fullReviewData[i].querySelector("[data-testid='review-pros']").innerText;
                    } catch (error) {}
                    try {
                        cons = fullReviewData[i].querySelector("[data-testid='review-cons']").innerText;
                    } catch (error) {}

                    results.push({
                        score: score,
                        companySize: companySize,
                        industry: industry,
                        timeUsed: timeUsed,
                        reviewSource: reviewSource,
                        date: date,
                        title: title,
                        summary: summary,
                        pros: pros,
                        cons: cons,
                    });
                }

                // Paginate by clicking the "next" button, if there is exactly one on the page.
                button = document.getElementsByClassName('next');
                if (button.length != 1) {
                    button = null;
                } else {
                    button[0].click();
                    await context.waitFor(1500);
                }
            } while (button != null && results.length <= 250); // Cap the total so the run does not time out.

            // Return an object with the data extracted from the page.
            // It will be stored to the resulting dataset.
            return {
                results: results,
            };
        },
        "postNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]`,
        "preNavigationHooks": `// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]`,
        "proxyConfiguration": {
            "useApifyProxy": true,
            "apifyProxyCountry": "US"
        },
        "startUrls": input.startUrls,
        "runMode": "PRODUCTION",
        "useChrome": false,
        "waitUntil": [
            "networkidle2"
        ]
    };

    // Now let's metamorph into the actor apify/web-scraper using the created input.
    await Apify.metamorph('apify/web-scraper', metamorphInput);
});
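
Since the page function returns { results: [...] }, each dataset item produced by the run is one object of that shape. As a sketch of the record layout (the keys come from the results.push() call above; the values here are placeholders, not real review data):

{
    "results": [
        {
            "score": 5,
            "companySize": "…",
            "industry": "…",
            "timeUsed": "…",
            "reviewSource": "…",
            "date": "…",
            "title": "…",
            "summary": "…",
            "pros": "…",
            "cons": "…"
        }
    ]
}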

package.json

{
    "name": "my-actor",
    "version": "0.0.1",
    "dependencies": {
        "apify": "^2.2.2"
    },
    "scripts": {
        "start": "node main.js"
    },
    "author": "Me!"
}
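
To call this actor from your own code, a minimal sketch using the separate apify-client NPM package (not a dependency of the actor itself) could look like the following; the token is a placeholder and the input mirrors the schema above:

// Minimal sketch: run the actor and fetch its results with apify-client.
// 'MY-APIFY-TOKEN' is a placeholder for your own Apify API token.
const { ApifyClient } = require('apify-client');

const client = new ApifyClient({ token: 'MY-APIFY-TOKEN' });

(async () => {
    // Run the actor with a single start URL and wait for it to finish.
    const run = await client.actor('undrtkr984/softwareadvicescrapereviews').call({
        startUrls: [{ url: 'https://www.softwareadvice.com/fleet-management/gps-insight-profile/' }],
        maxRequestRetries: 2,
    });

    // The scraped reviews are stored in the run's default dataset.
    const { items } = await client.dataset(run.defaultDatasetId).listItems();
    console.dir(items, { depth: null });
})();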