skynet-scrapper

tech_simphony/skynet-scrapper
.actor/Dockerfile

# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using the Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after the NPM install, the build will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start

.actor/actor.json

{
    "actorSpecification": 1,
    "name": "skynet-scraper",
    "title": "Skynet Scraper",
    "version": "1.0.0",
    "input": "./input_schema.json",
    "dockerfile": "./Dockerfile",
    "storages": {
        "dataset": "./dataset_schema.json"
    }
}

.actor/dataset_schema.json

{
    "actorSpecification": 1,
    "fields": {},
    "views": {
        "overview": {
            "title": "Overview",
            "transformation": {},
            "display": {
                "component": "table"
            }
        }
    }
}

.actor/input_schema.json

{
    "title": "Scrape data from a web page",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "queries": {
            "title": "Search Query",
            "type": "string",
            "description": "Search query to use on Google",
            "editor": "textfield",
            "prefill": "[tow truck near me california]"
        },
        "maxRequestsPerCrawl": {
            "title": "Max Requests per Crawl",
            "type": "integer",
            "description": "Maximum number of requests per crawl",
            "editor": "number",
            "prefill": 200
        }
    },
    "required": ["queries"]
}
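As a usage sketch (not part of the actor's source), an input matching the schema above could be supplied programmatically through the apify-client package this project already depends on; the environment variable and actor ID below are assumptions.

import { ApifyClient } from "apify-client";

// Assumes an Apify API token in the APIFY_TOKEN environment variable.
const client = new ApifyClient({ token: process.env.APIFY_TOKEN });

// Start the actor with an input matching input_schema.json and wait for it to finish.
const run = await client.actor("tech_simphony/skynet-scrapper").call({
    queries: "tow truck near me california",
    maxRequestsPerCrawl: 200,
});

// Fetch the scraped records from the run's default dataset.
const { items } = await client.dataset(run.defaultDatasetId).listItems();
console.log(items);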

.actor/output_schema.json

{
    "actorSpecification": 1,
    "name": "skynet-scraper",
    "title": "Skynet Scraper",
    "description": "",
    "version": "1.0.0",
    "properties": {
        "url": {
            "type": "string",
            "title": "URL",
            "description": "URL of the scraped result",
            "required": true
        },
        "title": {
            "type": "string",
            "title": "Title",
            "description": "Title of the scraped result",
            "required": true
        },
        "phoneNumber": {
            "type": "string",
            "title": "Phone number",
            "description": "Phone number extracted from the result",
            "required": true
        }
    },
    "fields": {},
    "views": {
        "overview": {
            "title": "Overview",
            "transformation": {},
            "display": {}
        }
    }
}
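For reference, a single organic result pushed by src/main.js below has roughly this shape; the field names mirror the code, but the values here are illustrative, not real output.

// Illustrative example of one organic result record as built in src/main.js.
const exampleOrganicResult = {
    title: "Acme Towing |~| 415-555-0132",
    url: "https://www.example.com/towing",
    phoneNumber: "415-555-0132",
    displayedUrl: "https://www.example.com/towing",
    description: "24/7 towing service in California.",
    emphasizedKeywords: [],
    siteLinks: [],
    productInfo: {},
    type: "organic",
    position: 1,
    advertiserName: "Acme Towing",
};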

src/main.js

import { Actor, Dataset } from "apify";
import { CheerioCrawler } from "crawlee";
import fs from "fs";

try {
    await Actor.init();

    const input = await Actor.getInput();
    let { queries, maxRequestsPerCrawl } = input;
    // The prefilled input arrives wrapped in brackets (e.g. "[tow truck near me california]");
    // strip them so they do not end up in the Google query.
    if (typeof queries === "string") {
        queries = queries.replace(/^\[|\]$/g, "").trim();
    }
    const searchQueries = Array.isArray(queries) ? queries : [queries];
    const searchQuery = searchQueries.join(" ");
    const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
        searchQuery
    )}`;
    console.log("URL ", searchUrl);

    // Matches common US phone formats such as 415-555-0132, (415) 555-0132, or 555-0132.
    const phoneNumberRegex =
        /(\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})/;

    const resultsData = {
        searchQuery: {
            term: queries,
            url: searchUrl,
            device: "MOBILE",
            page: 1,
            type: "SEARCH",
            domain: "google.com",
            countryCode: "US",
            languageCode: "en",
            locationUule: null,
            resultsPerPage: 10,
        },
        resultsTotal: "N/A",
        relatedQueries: [],
        paidResults: [],
        paidProducts: [],
        organicResults: [],
        peopleAlsoAsk: [],
    };

    // Strips URLs and trailing breadcrumb text so only the snippet remains.
    const extractDescription = (textContent) => {
        // Remove URLs and any text before the URL.
        const cleanedText = textContent
            .replace(/https?:\/\/[^\s]+|\/\/[^\s]+/g, "")
            .trim();

        // Remove any unneeded trailing text (e.g. text after '›' or ' · ').
        return cleanedText.replace(/(?: ›| · | - |:| · )[^›]*$/, "").trim();
    };

    const crawler = new CheerioCrawler({
        maxRequestsPerCrawl,
        requestHandler: async ({ request, $, log }) => {
            const searchResults = $("div.g, div.uEierd"); // "div.uEierd" matches some ad blocks
            log.info(`search results: ${searchResults.length}`);

            if (searchResults.length === 0) {
                log.info("No search results were found.");
            }

            searchResults.each((index, element) => {
                const $result = $(element);
                let title =
                    $result.find("h3").text().trim() ||
                    $result.find(".xA33Gc").text().trim(); // Fallback selector for ad titles
                const url = $result.find("a").attr("href") || "";
                const textContent = $result.text();
                const description = extractDescription(textContent);
                let phoneNumber = null;

                // Look for a phone number near "Call us" text.
                const callElements = $result.find('*:contains("Call us")');
                callElements.each((i, callElement) => {
                    const callText = $(callElement).text();
                    const phoneMatch = callText.match(phoneNumberRegex);
                    if (phoneMatch) {
                        phoneNumber = phoneMatch[0];
                    }
                });

                // Identify whether the result is sponsored.
                const isSponsored =
                    $result.hasClass("uEierd") ||
                    $result.find("span").text().toLowerCase().includes("ad");
                const advertiserName = $result
                    .find("div.specific-class-for-advertiser-name")
                    .text()
                    .trim(); // Placeholder selector; adjust to the actual markup

                // The raw title does not normally contain the "|~|" delimiter,
                // so this usually yields the whole title as advertiserNameTrim.
                const [advertiserNameTrim] = title
                    .split("|~|")
                    .map((str) => str.trim());

                if ((title || url || description) && phoneNumber) {
                    // Use a unique delimiter to separate the advertiser name
                    // from the phone number.
                    title = advertiserName
                        ? `${advertiserName} |~| ${phoneNumber}`
                        : `${phoneNumber}`;

                    log.info(`title: ${title}`);

                    if (isSponsored) {
                        resultsData.paidResults.push({
                            title,
                            url,
                            phoneNumber,
                            displayedUrl: url,
                            description,
                            emphasizedKeywords: [],
                            siteLinks: [],
                            type: "paid",
                            adPosition: index + 1,
                            advertiserName: advertiserNameTrim,
                        });
                    } else {
                        resultsData.organicResults.push({
                            title,
                            url,
                            phoneNumber,
                            displayedUrl: url,
                            description,
                            emphasizedKeywords: [],
                            siteLinks: [],
                            productInfo: {},
                            type: "organic",
                            position: index + 1,
                            advertiserName: advertiserNameTrim,
                        });
                    }
                } else {
                    log.info(`Result ${index + 1} is empty or incomplete.`);
                }
            });

            // People Also Ask
            const peopleAlsoAskElements = $("div.related-question-pair");
            if (peopleAlsoAskElements.length === 0) {
                log.info("No 'People Also Ask' entries were found.");
            }

            peopleAlsoAskElements.each((index, element) => {
                const question = $(element).find(".yuRUbf").text().trim();
                const answer = $(element).find(".VwiC3b").text().trim();
                const url = $(element).find("a").attr("href") || "";

                if (question || answer || url) {
                    resultsData.peopleAlsoAsk.push({
                        question,
                        answer,
                        url,
                        title: question,
                        date: "",
                    });
                } else {
                    log.info(`'People Also Ask' ${index + 1} is empty or incomplete.`);
                }
            });

            // Related Queries
            const relatedQueriesElements = $('a[data-hveid="CAEQAw"]');
            if (relatedQueriesElements.length === 0) {
                log.info("No 'Related Queries' were found.");
            }

            relatedQueriesElements.each((index, element) => {
                const relatedQuery = $(element).text().trim();
                const relatedUrl = $(element).attr("href") || "";

                if (relatedQuery || relatedUrl) {
                    resultsData.relatedQueries.push({
                        title: relatedQuery,
                        url: `https://www.google.com${relatedUrl}`,
                    });
                } else {
                    log.info(`'Related Query' ${index + 1} is empty or incomplete.`);
                }
            });

            // Handle pagination: follow the next-page link, limited to 25 pages here.
            const nextPageLink = $("a#pnnext").attr("href");
            if (nextPageLink && request.userData.page < 25) {
                await crawler.addRequests([
                    {
                        url: `https://www.google.com${nextPageLink}`,
                        userData: { page: request.userData.page + 1 },
                    },
                ]);
            }
        },
    });

    await crawler.run([
        {
            url: searchUrl,
            userData: { page: 1 },
        },
    ]);

    if (!resultsData.paidResults.length && !resultsData.organicResults.length) {
        console.log("No search results were found.");
    }

    // Save the result to the Apify key-value store and dataset.
    await Actor.setValue("OUTPUT", resultsData);
    await Dataset.pushData(resultsData);

    // Also write a local JSON file (ephemeral on the Apify platform).
    fs.writeFileSync("./output.json", JSON.stringify(resultsData, null, 2));
    console.log("Data saved to output.json");

    await Actor.exit();
} catch (error) {
    console.error("An error occurred while running the actor:", error);
    process.exit(1);
}
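As a quick sanity check, the phone-number regex from src/main.js can be exercised in plain Node.js without any Apify setup; the sample strings below are invented for illustration.

// Standalone sketch exercising the phone-number regex from src/main.js.
const phoneNumberRegex =
    /(\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})/;

// Made-up samples covering the three alternatives the regex handles.
const samples = [
    "Call us at 415-555-0132",
    "Call us today: (800) 555 0199",
    "Call us: 555.0177",
    "No number in this snippet",
];

for (const sample of samples) {
    const match = sample.match(phoneNumberRegex);
    console.log(sample, "=>", match ? match[0] : "no match");
}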

.dockerignore

# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git

.editorconfig

root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf

.eslintrc

{
    "extends": "@apify",
    "root": true
}

.gitignore

# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
dist
node_modules
apify_storage
storage

package.json

{
	"name": "my-web-scrapper",
	"version": "0.0.1",
	"type": "module",
	"description": "This is an example of an Apify actor.",
	"engines": {
		"node": ">=18.0.0"
	},
	"dependencies": {
		"@crawlee/http": "^3.9.2",
		"apify": "^3.1.10",
		"apify-client": "^2.9.3",
		"axios": "^1.5.0",
		"cheerio": "^1.0.0-rc.12",
		"crawlee": "^3.9.2",
		"random-useragent": "^0.5.0"
	},
	"scripts": {
		"start": "node ./src/main.js",
		"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
	},
	"author": "It's not you it's me",
	"license": "ISC"
}