skynet-scrapper
Try for free
No credit card required
Go to Store
skynet-scrapper
tech_simphony/skynet-scrapper
Try for free
No credit card required
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging. The npm cache is removed with -f so the layer still
# builds when the cache directory does not exist.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -rf ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY . ./

# Run the image. The exec (JSON array) form starts npm directly instead of
# wrapping it in `/bin/sh -c`, so the process receives container signals.
CMD ["npm", "run", "start"]
.actor/actor.json
1{
2 "actorSpecification": 1,
3 "name": "skinet-scraper",
4 "title": "Skynet Scraper",
5 "version": "1.0.0",
6 "input": "./input_schema.json",
7 "dockerfile": "./Dockerfile",
8 "storages": {
9 "dataset": "./dataset_schema.json"
10 }
11}
.actor/dataset_schema.json
1{
2 "actorSpecification": 1,
3 "fields": {},
4 "views": {
5 "overview": {
6 "title": "Overview",
7 "transformation": {},
8 "display": {
9 "component": "table"
10 }
11 }
12 }
13}
.actor/input_schema.json
1{
2 "title": "Scrape data from a web page",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "queries": {
7 "title": "Search Query",
8 "type": "string",
9 "description": "Search query to use on Google",
10 "editor": "textfield",
11 "prefill": "[tow truck near me california]"
12 },
13 "maxRequestsPerCrawl": {
14 "title": "Max Requests per Crawl",
15 "type": "integer",
16 "description": "Maximum number of requests per crawl",
17 "editor": "number",
18 "prefill": 200
19 }
20 },
21 "required": ["queries"]
22}
.actor/output_schema.json
1{
2 "actorSpecification": 1,
3 "name": "skynet-scraper",
4 "title": "skynet Scraper",
5 "description": "",
6 "version": "1.0.0",
7 "properties": {
8 "url": {
9 "type": "string",
10 "title": "URL",
11 "description": "URL of the scraped search result",
12 "required": true
13 },
14 "title": {
15 "type": "string",
16 "title": "title",
17 "description": "Title of the scraped search result",
18 "required": true
19 },
20 "phoneNumber": {
21 "type": "string",
22 "title": "phoneNumber",
23 "description": "Phone number found in the scraped search result",
24 "required": true
25 }
26 },
27 "fields": {},
28 "views": {
29 "overview": {
30 "title": "Overview",
31 "transformation": {},
32 "display": {}
33 }
34 }
35
36}
src/main.js
1import { Actor, Dataset } from "apify";
2import { CheerioCrawler } from "crawlee";
3import fs from "fs";
4
5try {
// Initialize the Actor runtime before reading input or touching storages.
await Actor.init();

// Actor.getInput() can resolve to null when the run was started without
// input; fall back to an empty object so the validation below reports the
// real problem instead of a destructuring TypeError.
const input = (await Actor.getInput()) ?? {};
let { queries, maxRequestsPerCrawl } = input;

// `queries` is marked as required in the input schema. Fail early with a
// clear message rather than silently searching Google for an empty string.
const hasQueries = Array.isArray(queries)
  ? queries.length > 0
  : typeof queries === "string" && queries.trim() !== "";
if (!hasQueries) {
  throw new Error('Input field "queries" is required and must not be empty.');
}

if (typeof queries === "string") {
  // Normalize a string query to exactly one pair of surrounding brackets,
  // whether or not the user typed them (the schema prefill includes them).
  queries = `[${queries.replace(/^\[|\]$/g, "")}]`;
}

// Accept either a single query or a list; a list is searched as one phrase.
const searchQueries = Array.isArray(queries) ? queries : [queries];
const searchQuery = searchQueries.join(" ");
const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
  searchQuery
)}`;
console.log("URL ", searchUrl);
20
// Matches a phone number written as 10 digits with optional "-", "." or
// whitespace separators, as "(555) 123-4567", or as a 7-digit local number.
// The digit lookarounds keep the pattern from matching a slice of a longer
// digit run (IDs, timestamps, ...), which would otherwise be reported as a
// phone number. The single capture group is kept so match[0] === match[1].
const phoneNumberRegex =
  /(?<!\d)(\d{3}[-.\s]?\d{3}[-.\s]?\d{4}|\(\d{3}\)\s*\d{3}[-.\s]?\d{4}|\d{3}[-.\s]?\d{4})(?!\d)/;
23
// Metadata describing the search this run performs. Apart from the term and
// the URL, every value here is a fixed constant.
const searchQueryInfo = {
  term: queries,
  url: searchUrl,
  device: "MOBILE",
  page: 1,
  type: "SEARCH",
  domain: "google.com",
  countryCode: "US",
  languageCode: "en",
  locationUule: null,
  resultsPerPage: 10,
};

// Accumulator for everything scraped during the run; the result lists start
// empty and are appended to by the crawler's page handler.
const resultsData = {
  searchQuery: searchQueryInfo,
  resultsTotal: "N/A",
  relatedQueries: [],
  paidResults: [],
  paidProducts: [],
  organicResults: [],
  peopleAlsoAsk: [],
};
44
// Origin used to resolve the relative links found on the results page
// (pagination and related searches).
const GOOGLE_ORIGIN = "https://www.google.com";

// A result counts as sponsored when one of its <span> elements consists of
// exactly this label. Matching the whole label (instead of looking for the
// substring "ad" anywhere in the text) keeps words such as "roadside" or
// "address" from flagging organic results as paid.
const SPONSORED_LABEL_REGEX = /^(?:ads?|sponsored)$/i;

/**
 * Resolves an href taken from the results page against the Google origin.
 * Relative hrefs get the origin prepended; absolute hrefs are kept as they are.
 *
 * @param {string} href - Value of an anchor's href attribute (may be empty).
 * @returns {string} Absolute URL, or the bare origin when href is empty.
 */
const toGoogleUrl = (href) => {
  if (!href) {
    return GOOGLE_ORIGIN;
  }
  try {
    return new URL(href, GOOGLE_ORIGIN).href;
  } catch {
    // Unparsable href: fall back to plain concatenation.
    return `${GOOGLE_ORIGIN}${href}`;
  }
};

// Crawler that parses each results page and appends what it finds to
// `resultsData`. The handler only runs once `crawler.run()` is called.
const crawler = new CheerioCrawler({
  maxRequestsPerCrawl,
  // `requestHandler` is the current Crawlee option name; `handlePageFunction`
  // is its deprecated alias.
  requestHandler: async ({ request, $ }) => {
    // "div.uEierd" is used for some ad blocks
    const searchResults = $("div.g, div.uEierd");
    console.log("search results:", searchResults.length);

    if (searchResults.length === 0) {
      console.log("No search results were found.");
    }

    searchResults.each((index, element) => {
      const $result = $(element);
      const originalTitle =
        $result.find("h3").text().trim() ||
        $result.find(".xA33Gc").text().trim();
      const url = $result.find("a").attr("href") || "";
      const description = extractDescription($result.text());

      // Search for a phone number in every element whose text contains
      // "Call us"; when several elements match, the last one wins.
      let phoneNumber = null;
      $result.find('*:contains("Call us")').each((i, callElement) => {
        const phoneMatch = $(callElement).text().match(phoneNumberRegex);
        if (phoneMatch) {
          phoneNumber = phoneMatch[0];
        }
      });

      // Identify if the result is sponsored
      const hasSponsoredLabel = $result
        .find("span")
        .toArray()
        .some((span) => SPONSORED_LABEL_REGEX.test($(span).text().trim()));
      const isSponsored = $result.hasClass("uEierd") || hasSponsoredLabel;

      // NOTE(review): this selector looks like a placeholder (marked as an
      // example in the original); confirm the real class for advertiser names.
      const advertiserName = $result
        .find("div.specific-class-for-advertiser-name")
        .text()
        .trim();

      // The scraped title is reported in the `advertiserName` field, because
      // `title` itself is replaced below by the phone-number based label.
      const [advertiserNameTrim] = originalTitle
        .split("|~|")
        .map((str) => str.trim());

      // Only results that carry a phone number are kept.
      if (!((originalTitle || url || description) && phoneNumber)) {
        console.log(`Resultado ${index + 1} está vacío o incompleto.`);
        return;
      }

      // Use a unique delimiter to separate the advertiser name from the
      // phone number; without an advertiser name the title is the number.
      const title = advertiserName
        ? `${advertiserName} |~| ${phoneNumber}`
        : `${phoneNumber}`;

      console.log("title:", title);

      const commonFields = {
        title,
        url,
        phoneNumber,
        displayedUrl: url,
        description,
        emphasizedKeywords: [],
        siteLinks: [],
      };

      if (isSponsored) {
        resultsData.paidResults.push({
          ...commonFields,
          type: "paid",
          adPosition: index + 1,
          advertiserName: advertiserNameTrim,
        });
      } else {
        resultsData.organicResults.push({
          ...commonFields,
          productInfo: {},
          type: "organic",
          position: index + 1,
          advertiserName: advertiserNameTrim,
        });
      }
    });

    // People Also Ask
    const peopleAlsoAskElements = $("div.related-question-pair");
    if (peopleAlsoAskElements.length === 0) {
      console.log("No se encontraron 'People Also Ask'.");
    }

    peopleAlsoAskElements.each((index, element) => {
      const $pair = $(element);
      const question = $pair.find(".yuRUbf").text().trim();
      const answer = $pair.find(".VwiC3b").text().trim();
      const answerUrl = $pair.find("a").attr("href") || "";

      if (question || answer || answerUrl) {
        resultsData.peopleAlsoAsk.push({
          question,
          answer,
          url: answerUrl,
          title: question,
          date: "",
        });
      } else {
        console.log(
          `'People Also Ask' ${index + 1} está vacío o incompleto.`
        );
      }
    });

    // Related Queries
    const relatedQueriesElements = $('a[data-hveid="CAEQAw"]');
    if (relatedQueriesElements.length === 0) {
      console.log("No se encontraron 'Related Queries'.");
    }

    relatedQueriesElements.each((index, element) => {
      const $link = $(element);
      const relatedQuery = $link.text().trim();
      const relatedHref = $link.attr("href") || "";

      if (relatedQuery || relatedHref) {
        resultsData.relatedQueries.push({
          title: relatedQuery,
          url: toGoogleUrl(relatedHref),
        });
      } else {
        console.log(`'Related Query' ${index + 1} está vacío o incompleto.`);
      }
    });

    // Handling pagination: follow the "next page" link, limited to 25 pages.
    const nextPageLink = $("a#pnnext").attr("href");
    if (nextPageLink && request.userData.page < 25) {
      await crawler.addRequests([
        {
          url: toGoogleUrl(nextPageLink),
          userData: { page: request.userData.page + 1 },
        },
      ]);
    }
  },
});
193
/**
 * Derives a short description from the raw text of a search result.
 *
 * URLs (absolute or protocol-relative) are removed first. The remaining text
 * is then cut at the first separator (" ›", " · ", " - " or ":") that has no
 * "›" anywhere after it, dropping the separator and everything behind it.
 *
 * @param {string} textContent - Raw text of the result element.
 * @returns {string} The cleaned description; "" for empty or non-string input.
 */
const extractDescription = (textContent) => {
  // Guard against null/undefined so callers never hit a TypeError here.
  if (typeof textContent !== "string") {
    return "";
  }

  // Remove URLs (the text around them is kept).
  const withoutUrls = textContent
    .replace(/https?:\/\/[^\s]+|\/\/[^\s]+/g, "")
    .trim();

  // Remove the trailing part that follows the separator. The middle dot is
  // accepted with regular or non-breaking spaces around it.
  return withoutUrls
    .replace(/(?: ›|[ \u00A0]·[ \u00A0]| - |:)[^›]*$/, "")
    .trim();
};
207
// Start the crawl on the first results page; follow-up pages are enqueued
// by the page handler itself.
const startRequest = { url: searchUrl, userData: { page: 1 } };
await crawler.run([startRequest]);

const foundAnyResult =
  resultsData.paidResults.length > 0 ||
  resultsData.organicResults.length > 0;
if (!foundAnyResult) {
  console.log("No se encontraron resultados de búsqueda.");
}

// Store the collected data both under the "OUTPUT" key and in the dataset.
await Actor.setValue("OUTPUT", resultsData);
await Dataset.pushData(resultsData);

// Also write a pretty-printed JSON copy to the working directory.
const serializedResults = JSON.stringify(resultsData, null, 2);
fs.writeFileSync("./output.json", serializedResults);
console.log("Datos guardados en output.json");

await Actor.exit();
228} catch (error) {
229 console.error("Se produjo un error durante la ejecución del actor:", error);
230 process.exit(1);
231}
.dockerignore
1# configurations
2.idea
3
4# crawlee and apify storage folders
5apify_storage
6crawlee_storage
7storage
8
9# installed files
10node_modules
11
12# git folder
13.git
.editorconfig
1root = true
2
3[*]
4indent_style = space
5indent_size = 4
6charset = utf-8
7trim_trailing_whitespace = true
8insert_final_newline = true
9end_of_line = lf
.eslintrc
1{
2 "extends": "@apify",
3 "root": true
4}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.DS_Store
4.idea
5dist
6node_modules
7apify_storage
8storage
package.json
1{
2 "name": "my-web-scrapper",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is an example of an Apify actor.",
6 "engines": {
7 "node": ">=18.0.0"
8 },
9 "dependencies": {
10 "@crawlee/http": "^3.9.2",
11 "apify": "^3.1.10",
12 "apify-client": "^2.9.3",
13 "axios": "^1.5.0",
14 "cheerio": "^1.0.0-rc.12",
15 "crawlee": "^3.9.2",
16 "random-useragent": "^0.5.0"
17 },
18 "scripts": {
19 "start": "node ./src/main.js",
20 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
21 },
22 "author": "It's not you it's me",
23 "license": "ISC"
24}
Developer
Maintained by Community
Actor Metrics
1 monthly user
-
1 star
>99% runs succeeded
Created in May 2024
Modified 4 months ago
Categories