skynet-scrapper
0.0 (0)
Pricing: Pay per usage
Total users: 7
Monthly users: 2
Runs succeeded: >99%
Last modified: 8 months ago
.actor/Dockerfile
# Specify the base Docker image. You can read more about
# the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node:20

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging.
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version \
    && rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY . ./

# Run the image.
CMD npm run start
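This is the stock Apify Node.js actor Dockerfile; nothing in it is specific to this scraper beyond the Node 20 base image. On the Apify platform the image is built automatically from the "dockerfile" path referenced in .actor/actor.json below, so building it locally with the standard docker build/run flow is optional.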
.actor/actor.json
{ "actorSpecification": 1, "name": "skinet-scraper", "title": "skinet scraper", "version": "1.0.0", "input": "./input_schema.json", "dockerfile": "./Dockerfile", "storages": { "dataset": "./dataset_schema.json" }}
.actor/dataset_schema.json
{ "actorSpecification": 1, "fields": {}, "views": { "overview": { "title": "Overview", "transformation": {}, "display": { "component": "table" } } }}
.actor/input_schema.json
{ "title": "Scrape data from a web page", "type": "object", "schemaVersion": 1, "properties": { "queries": { "title": "Search Query", "type": "string", "description": "Search query to use on Google", "editor": "textfield", "prefill": "[tow truck near me california]" }, "maxRequestsPerCrawl": { "title": "Max Requests per Crawl", "type": "integer", "description": "Maximum number of requests per crawl", "editor": "number", "prefill": 200 } }, "required": ["queries"]}
.actor/output_schema.json
{ "actorSpecification": 1, "name": "skynet-scraper", "title": "skynet Scraper", "description": "", "version": "1.0.0", "properties": { "url": { "type": "string", "title": "URL", "description": "URL to scrape", "required": true }, "title": { "type": "string", "title": "title", "description": "title to scrape", "required": true }, "phoneNumber": { "type": "string", "title": "phoneNumber", "description": "phoneNumber to scrape", "required": true } }, "fields": {}, "views": { "overview": { "title": "Overview", "transformation": {}, "display": {} } } }
src/main.js
import { Actor, Dataset } from "apify";
import { CheerioCrawler } from "crawlee";
import fs from "fs";
try {
    await Actor.init();
    const input = await Actor.getInput();
    let { queries, maxRequestsPerCrawl } = input;
    // Normalize the query: strip any surrounding brackets, then re-add them so the
    // prefill format "[query]" and a bare "query" produce the same search term.
    if (typeof queries === "string") {
        queries = queries.replace(/^\[|\]$/g, "");
        queries = `[${queries}]`;
    }
    const searchQueries = Array.isArray(queries) ? queries : [queries];
    const searchQuery = searchQueries.join(" ");
    const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(
        searchQuery
    )}`;
    console.log("URL ", searchUrl);
    const phoneNumberRegex =
        /(\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})/;
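    // The alternation above matches, in order:
    //   555-123-4567 / 555.123.4567 / 5551234567   (10 digits with optional separators)
    //   (555) 123-4567                              (parenthesized area code)
    //   123-4567                                    (bare 7-digit local numbers)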
    const resultsData = {
        searchQuery: {
            term: queries,
            url: searchUrl,
            device: "MOBILE",
            page: 1,
            type: "SEARCH",
            domain: "google.com",
            countryCode: "US",
            languageCode: "en",
            locationUule: null,
            resultsPerPage: 10,
        },
        resultsTotal: "N/A",
        relatedQueries: [],
        paidResults: [],
        paidProducts: [],
        organicResults: [],
        peopleAlsoAsk: [],
    };
    const crawler = new CheerioCrawler({
        maxRequestsPerCrawl,
        // Crawlee v3 name for the per-page handler (formerly handlePageFunction).
        requestHandler: async ({ request, response, $, log }) => {
            const searchResults = $("div.g, div.uEierd"); // "div.uEierd" is used for some ad blocks
            console.log("search results:", searchResults.length);
            if (searchResults.length === 0) {
                console.log("No search results were found.");
            }
            searchResults.each((index, element) => {
                const $result = $(element);
                let title =
                    $result.find("h3").text().trim() ||
                    $result.find(".xA33Gc").text().trim(); // Adjusted selector
                const url = $result.find("a").attr("href") || "";
                // const description = $result.find('span.VwiC3b').text().trim() || '';
                const textContent = $result.text();
                const description = extractDescription(textContent);
                let phoneNumber = null;
                // Look for a phone number near "Call us" text
                const callElements = $result.find('*:contains("Call us")');
                callElements.each((i, callElement) => {
                    const callText = $(callElement).text();
                    const phoneMatch = callText.match(phoneNumberRegex);
                    if (phoneMatch) {
                        phoneNumber = phoneMatch[0];
                    }
                });
                // Identify whether the result is sponsored
                const isSponsored =
                    $result.hasClass("uEierd") ||
                    $result.find("span").text().toLowerCase().includes("ad");
                let advertiserName = $result
                    .find("div.specific-class-for-advertiser-name")
                    .text()
                    .trim(); // Placeholder selector; replace with the real advertiser-name element
                if ((title || url || description) && phoneNumber) {
                    if (advertiserName) {
                        // Use a unique delimiter to separate the advertiser name from the phone number
                        title = `${advertiserName} |~| ${phoneNumber}`;
                    } else {
                        title = `${phoneNumber}`;
                    }

                    // Split the delimited title back into its trimmed parts
                    // (done after the title is built so the delimiter actually exists).
                    const [advertiserNameTrim] = title
                        .split("|~|")
                        .map((str) => str.trim());
99 console.log("title:", title);100
                    if (isSponsored) {
                        resultsData.paidResults.push({
                            title,
                            url,
                            phoneNumber,
                            displayedUrl: url,
                            description,
                            emphasizedKeywords: [],
                            siteLinks: [],
                            type: "paid",
                            adPosition: index + 1,
                            advertiserName: advertiserNameTrim,
                        });
                    } else {
                        resultsData.organicResults.push({
                            title,
                            url,
                            phoneNumber,
                            displayedUrl: url,
                            description,
                            emphasizedKeywords: [],
                            siteLinks: [],
                            productInfo: {},
                            type: "organic",
                            position: index + 1,
                            advertiserName: advertiserNameTrim,
                        });
                    }
                } else {
                    console.log(`Result ${index + 1} is empty or incomplete.`);
                }
            });
            // People Also Ask
            const peopleAlsoAskElements = $("div.related-question-pair");
            if (peopleAlsoAskElements.length === 0) {
                console.log("No 'People Also Ask' entries were found.");
            }
            peopleAlsoAskElements.each((index, element) => {
                const $question = $(element).find(".yuRUbf").text().trim();
                const $answer = $(element).find(".VwiC3b").text().trim();
                const $url = $(element).find("a").attr("href") || "";
                if ($question || $answer || $url) {
                    resultsData.peopleAlsoAsk.push({
                        question: $question,
                        answer: $answer,
                        url: $url,
                        title: $question,
                        date: "",
                    });
                } else {
                    console.log(
                        `'People Also Ask' ${index + 1} is empty or incomplete.`
                    );
                }
            });
            // Related Queries
            const relatedQueriesElements = $('a[data-hveid="CAEQAw"]');
            if (relatedQueriesElements.length === 0) {
                console.log("No 'Related Queries' were found.");
            }
            relatedQueriesElements.each((index, element) => {
                const $relatedQuery = $(element).text().trim();
                const $relatedUrl = $(element).attr("href") || "";
                if ($relatedQuery || $relatedUrl) {
                    resultsData.relatedQueries.push({
                        title: $relatedQuery,
                        url: `https://www.google.com${$relatedUrl}`,
                    });
                } else {
                    console.log(`'Related Query' ${index + 1} is empty or incomplete.`);
                }
            });
            // Handle pagination: check for the next-page link
            const nextPageLink = $("a#pnnext").attr("href");
            if (nextPageLink && request.userData.page < 25) {
                // Limit to 25 pages for this example
                await crawler.addRequests([
                    {
                        url: `https://www.google.com${nextPageLink}`,
                        userData: { page: request.userData.page + 1 },
                    },
                ]);
            }
        },
    });
    const extractDescription = (textContent) => {
        // Remove URLs and any text before them
        const cleanedText = textContent
            .replace(/https?:\/\/[^\s]+|\/\/[^\s]+/g, "")
            .trim();

        // Strip unneeded trailing text (e.g., anything after '›' or ' · ')
        const description = cleanedText
            .replace(/(?: ›| · | - |:| · )[^›]*$/, "")
            .trim();

        return description;
    };
    await crawler.run([
        {
            url: searchUrl,
            userData: { page: 1 },
        },
    ]);
    if (!resultsData.paidResults.length && !resultsData.organicResults.length) {
        console.log("No search results were found.");
    }
    // Save the results to the Apify key-value store and the default dataset
    await Actor.setValue("OUTPUT", resultsData);
    await Dataset.pushData(resultsData);
    // Also write the results to a local JSON file
    fs.writeFileSync("./output.json", JSON.stringify(resultsData, null, 2));
    console.log("Data saved to output.json");
    await Actor.exit();
} catch (error) {
    console.error("An error occurred while running the actor:", error);
    process.exit(1);
}
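The phone-number matching is easy to exercise outside the crawler. A minimal standalone sketch (not part of the actor; the sample strings are made up) that feeds the same pattern a few inputs:

// regex-check.js -- standalone sanity check for the phone-number pattern
// used in src/main.js.
const phoneNumberRegex =
    /(\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})/;

const samples = [
    "Call us at (555) 123-4567 for towing",
    "Call us: 555-123-4567",
    "Open 24 hours, no number listed",
];

for (const text of samples) {
    const match = text.match(phoneNumberRegex);
    console.log(JSON.stringify(text), "=>", match ? match[0] : null);
}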
.dockerignore
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
.editorconfig
root = true
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
.eslintrc
{ "extends": "@apify", "root": true}
.gitignore
# This file tells Git which files shouldn't be added to source control
.DS_Store
.idea
dist
node_modules
apify_storage
storage
package.json
{ "name": "my-web-scrapper", "version": "0.0.1", "type": "module", "description": "This is an example of an Apify actor.", "engines": { "node": ">=18.0.0" }, "dependencies": { "@crawlee/http": "^3.9.2", "apify": "^3.1.10", "apify-client": "^2.9.3", "axios": "^1.5.0", "cheerio": "^1.0.0-rc.12", "crawlee": "^3.9.2", "random-useragent": "^0.5.0" }, "scripts": { "start": "node ./src/main.js", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1" }, "author": "It's not you it's me", "license": "ISC"}