Google Search By Image
Deprecated
View all Actors
This Actor is unavailable because the developer has decided to deprecate it. Would you like to try a similar Actor instead?
See alternative Actors
Google Search By Image
pocesar/google-search-by-image
Search Google by image: provide image URLs and get back the matching results. It can be used for reverse image search, finding people by photo, detecting and identifying objects in images, getting product names, finding social media profiles related to the person in a photo, and finding related images and terms.
src/main.ts
1import { Actor } from 'apify';
2import { BasicCrawler, log } from 'crawlee';
3import { gotScraping } from 'got-scraping';
4import FormData from 'form-data';
5import { JSDOM } from 'jsdom';
6import { createHash } from 'crypto';
7
/**
 * Derives a key-value store record key from a URL.
 *
 * The key is the lowercase hex SHA-256 digest of `url` with its last two
 * characters dropped, i.e. always 62 hexadecimal characters.
 *
 * @param url - The URL (or any string) to hash.
 * @returns The truncated hex digest.
 */
const createValidKVNameFromUrl = (url: string) => {
    const hasher = createHash('sha256');
    hasher.update(url);

    const hexDigest = hasher.digest('hex');

    return hexDigest.substring(0, hexDigest.length - 2);
};
9
/**
 * Static HTTP headers sent with every search request: the request is presented
 * as originating from www.google.com and caching is disabled.
 */
const headers: Record<string, string> = Object.fromEntries<string>([
    ['Alt-Used', 'www.google.com'],
    ['Origin', 'https://www.google.com'],
    ['Referer', 'https://www.google.com/'],
    ['Cache-Control', 'no-cache'],
    ['Pragma', 'no-cache'],
]);
17
await Actor.init();

/**
 * Shape of the Actor input.
 */
interface InputSchema {
    /**
     * Images to search for. Each item is either a bare image URL or a
     * request-like object (`{ url, userData }`), which is the form the
     * `requestListSources` editor declared in INPUT_SCHEMA.json supplies.
     * `userData` is echoed back in the corresponding dataset item.
     */
    startUrls: (string | { url: string; userData?: Record<string, unknown> })[];
    /** When true, the log level is raised to DEBUG. */
    debug?: boolean;
}

// Fall back to an empty object so destructuring works when no input was provided.
const { startUrls = [], debug } = (await Actor.getInput<InputSchema>()) ?? {};

if (debug) {
    log.setLevel(log.LEVELS.DEBUG);
}

// All search requests are routed through US residential proxies.
const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'US',
});
35
/**
 * Builds the Google "search by image" URL for a given image.
 *
 * @param imageUrl - Publicly reachable URL of the image to look up.
 * @returns The absolute search URL with `imageUrl` and the fixed form fields
 *          encoded as query parameters.
 */
const newUrl = (imageUrl: string) => {
    // Query parameters in the order they are appended to the URL.
    const query: ReadonlyArray<readonly [string, string]> = [
        ['image_url', imageUrl],
        ['btnG', 'Search by image'],
        ['encoded_image', ''],
        ['image_content', ''],
        ['filename', ''],
        ['hl', 'en'],
    ];

    // NOTE(review): this targets google.com.br while the request headers reference
    // www.google.com — presumably intentional; confirm before changing.
    const target = new URL('https://www.google.com.br/searchbyimage');

    for (const [name, value] of query) {
        target.searchParams.set(name, value);
    }

    return target.href;
};
48
const { defaultKeyValueStoreId } = Actor.getEnv();

/** Splits a base64 `data:` URI into its media type (group 1) and encoded payload (group 2). */
const BASE64_DATA_URI = /^data:(.*?);base64,([\s\S]*)$/;

/** Loose test for date-like text: a digit, a comma, a space and another digit (e.g. "5, 2"). */
const DATE_LIKE = /\d, \d/;

const crawler = new BasicCrawler({
    maxConcurrency: 3,
    useSessionPool: true,
    /**
     * Runs one reverse image search.
     *
     * `request.url` is the image URL to look up. The handler fetches the
     * search-by-image page through the proxy, loads it into jsdom, extracts
     * the related-search link, the "matches" and "results" lists and the
     * image thumbnails, stores each inline thumbnail in the default key-value
     * store, and pushes a single dataset item that also echoes `userData`.
     *
     * @throws Error when the page has no `#rso` element (no results rendered);
     *         the crawler's retry handling takes over from there.
     */
    async requestHandler({ session, request }) {
        const { userData } = request;

        const response = await gotScraping({
            url: newUrl(request.url),
            method: 'GET',
            // Keyed by session id so one session keeps using the same proxy URL.
            proxyUrl: await proxyConfiguration!.newUrl(session!.id),
            headers,
            responseType: 'text',
        } as any);

        // SECURITY NOTE(review): `runScripts: 'dangerously'` executes the fetched page's
        // scripts inside this Node.js process. Presumably this is needed so the page can
        // populate its inline thumbnails — confirm it is still required before relying on it.
        const { window } = new JSDOM(response.body, {
            url: 'https://www.google.com/search/',
            runScripts: 'dangerously',
            pretendToBeVisual: true,
        });

        try {
            const { document } = window;

            if (!document.querySelector('#rso')) {
                throw new Error(`No results found`);
            }

            const relatedSearch = document.querySelector<HTMLAnchorElement>('#topstuff a.fKDtNb[href^="/search"]')?.href;

            const matches = Array.from(
                document.querySelectorAll<HTMLDivElement>('#search .normal-header ~ .g'),
                (div) => {
                    const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));
                    const date = info?.querySelector('span:nth-child(3)')?.textContent ?? null;

                    return {
                        title: div.querySelector('h3')?.textContent,
                        url: div.querySelector('a')?.href,
                        date: date !== null && DATE_LIKE.test(date) ? date : null,
                        text: snippet?.textContent || null,
                    };
                },
            );

            const results = Array.from(
                document.querySelectorAll<HTMLDivElement>('#rso > div:first-child .g'),
                (div) => {
                    const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));

                    return {
                        title: div.querySelector('h3')?.textContent,
                        url: div.querySelector('a')?.href,
                        // The whole info line is tested, but only its first <span> is reported.
                        date: DATE_LIKE.test(info?.textContent ?? '') ? info?.querySelector('span')?.textContent : null,
                        text: snippet?.textContent || null,
                    };
                },
            );

            const images = Array.from(
                document.querySelectorAll<HTMLElement>('[data-lpage]'),
                (element) => ({
                    image: '',
                    url: element.dataset['lpage'],
                    imageData: element.querySelector('img')?.src,
                }),
            );

            for (const image of images) {
                if (!image.imageData || !image.url) {
                    continue;
                }

                // Only inline base64 thumbnails can be decoded and stored. Any other
                // `src` (plain URL, non-base64 data URI) is left on the item untouched
                // instead of failing the whole request on an undefined payload.
                const parsed = BASE64_DATA_URI.exec(image.imageData);

                if (!parsed) {
                    continue;
                }

                const contentType = parsed[1] ?? '';
                const payload = parsed[2] ?? '';
                const hash = createValidKVNameFromUrl(image.url);

                await Actor.setValue(
                    hash,
                    Buffer.from(payload, 'base64'),
                    { contentType },
                );

                // Replace the bulky inline data with a link to the stored record.
                delete image.imageData;
                image.image = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${hash}`;
            }

            await Actor.pushData({
                relatedSearch: relatedSearch || null,
                matches,
                results,
                images,
                userData,
            });
        } finally {
            // Shut the jsdom window down so its timers and script state do not
            // accumulate across requests.
            window.close();
        }
    },
});

await crawler.addRequests(startUrls);

log.info('Starting the crawl.');
await crawler.run();
log.info('Crawl finished.');

await Actor.exit();
crawlee_storage/key_value_stores/default/INPUT.json
1{
2 "startUrls": [
3 "https://apify.com"
4 ]
5}
.gitignore
1# This file tells Git which files shouldn't be added to source control
2
3.idea
4dist
5node_modules
6apify_storage
7crawlee_storage
Dockerfile
# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:18-beta AS builder

# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
    && npm run build

# create final image
FROM apify/actor-node:18-beta
# copy only necessary files out of the builder stage; without --from=builder these
# absolute paths would be looked up in the build context instead of the built image
# (assumes /usr/src/app is the base image's working directory — confirm)
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/README.md ./
COPY --from=builder /usr/src/app/dist ./dist
COPY --from=builder /usr/src/app/apify.json ./apify.json
COPY --from=builder /usr/src/app/INPUT_SCHEMA.json ./INPUT_SCHEMA.json

# install only prod deps
RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional \
    && echo "Installed NPM packages:" \
    && (npm list --only=prod --no-optional --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# run compiled code
CMD npm run start:prod
INPUT_SCHEMA.json
1{
2 "title": "@crawlee/cheerio typescript template",
3 "type": "object",
4 "schemaVersion": 1,
5 "properties": {
6 "startUrls": {
7 "title": "Image URLs",
8 "type": "array",
9 "description": "Image URLs to search for",
10 "editor": "requestListSources",
11 "prefill": [
12 {
13 "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dc/Steve_Jobs_Headshot_2010-CROP_%28cropped_2%29.jpg/1024px-Steve_Jobs_Headshot_2010-CROP_%28cropped_2%29.jpg",
14 "userData": {
15 "name": "Steve Jobs"
16 }
17 }
18 ]
19 },
20 "debug": {
21 "title": "Debug mode",
22 "type": "boolean",
23 "description": "Enable additional logging",
24 "editor": "checkbox",
25 "default": false
26 }
27 },
28 "required": [
29 "startUrls"
30 ]
31}
apify.json
1{
2 "name": "crawlee-cheerio-typescript",
3 "title": "Crawlee + Cheerio + Typescript",
4 "version": "0.0",
5 "buildTag": "latest"
6}
package.json
1{
2 "name": "crawlee-cheerio-typescript",
3 "version": "0.0.1",
4 "type": "module",
5 "description": "This is an example of an Apify actor.",
6 "dependencies": {
7 "apify": "^3.0.2",
8 "crawlee": "^3.0.3",
9 "got-scraping": "^3.2.10",
10 "form-data": "^4.0.0"
11 },
12 "devDependencies": {
13 "@apify/tsconfig": "^0.1.0",
14 "ts-node": "^10.9.1",
15 "typescript": "^4.7.4"
16 },
17 "scripts": {
18 "start": "npm run start:dev",
19 "start:prod": "node dist/main.js",
20 "start:dev": "ts-node-esm -T src/main.ts",
21 "build": "tsc",
22 "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
23 },
24 "author": "It's not you it's me",
25 "license": "ISC"
26}
payload
1-----------------------------324172699818326226173603017879
2Content-Disposition: form-data; name="image_url"
3
4
5-----------------------------324172699818326226173603017879
6Content-Disposition: form-data; name="encoded_image"; filename=""
7Content-Type: application/octet-stream
8
9
10-----------------------------324172699818326226173603017879
11Content-Disposition: form-data; name="image_content"
12
13base64
14-----------------------------324172699818326226173603017879
15Content-Disposition: form-data; name="filename"
16
17x.jpg
18-----------------------------324172699818326226173603017879
19Content-Disposition: form-data; name="hl"
20
21en-BR
22-----------------------------324172699818326226173603017879--
tsconfig.json
1{
2 "extends": "@apify/tsconfig",
3 "compilerOptions": {
4 "module": "ES2022",
5 "target": "ES2022",
6 "outDir": "dist",
7 "noUnusedLocals": false,
8 "lib": ["DOM"]
9 },
10 "include": [
11 "./src/**/*"
12 ]
13}
Developer
Maintained by Community
Categories