Google Search By Image avatar
Google Search By Image

Deprecated

Pricing

Pay per usage

Go to Store
Google Search By Image

Google Search By Image

Deprecated

Developed by

Paulo Cesar

Paulo Cesar

Maintained by Community

Search Google results by providing image URLs. It can be used for: reverse search, finding people by photo, detecting objects on images, getting product names, finding social media related to the person in the photo, finding related images and terms, and identifying objects on photos

0.0 (0)

Pricing

Pay per usage

1

Total users

45

Monthly users

1

Runs succeeded

>99%

Last modified

2 years ago

src/main.ts

1import { Actor } from 'apify';
2import { BasicCrawler, log } from 'crawlee';
3import { gotScraping } from 'got-scraping';
4import FormData from 'form-data';
5import { JSDOM } from 'jsdom';
6import { createHash } from 'crypto';
7
8const createValidKVNameFromUrl = (url: string) => createHash('sha256').update(url).digest('hex').slice(0, -2);
9
10const headers: Record<string, string> = {
11 'Alt-Used': 'www.google.com',
12 'Origin': 'https://www.google.com',
13 'Referer': 'https://www.google.com/',
14 'Cache-Control': 'no-cache',
15 'Pragma': "no-cache",
16};
17
18await Actor.init();
19
20interface InputSchema {
21 startUrls: string[];
22 debug?: boolean;
23}
24
25const { startUrls = [], debug, } = await Actor.getInput<InputSchema>() ?? {};
26
27if (debug) {
28 log.setLevel(log.LEVELS.DEBUG);
29}
30
31const proxyConfiguration = await Actor.createProxyConfiguration({
32 groups: ['RESIDENTIAL'],
33 countryCode: 'US'
34});
35
36const newUrl = (imageUrl: string) => {
37 const nUrl = new URL('https://www.google.com.br/searchbyimage');
38
39 nUrl.searchParams.set('image_url', imageUrl);
40 nUrl.searchParams.set('btnG', 'Search by image');
41 nUrl.searchParams.set('encoded_image', '');
42 nUrl.searchParams.set('image_content', '');
43 nUrl.searchParams.set('filename', '');
44 nUrl.searchParams.set('hl', 'en');
45
46 return nUrl.toString();
47}
48
49const { defaultKeyValueStoreId } = Actor.getEnv();
50
51const crawler = new BasicCrawler({
52 maxConcurrency: 3,
53 useSessionPool: true,
54 async requestHandler({ session, request }) {
55 const { userData } = request;
56
57 const response = await gotScraping({
58 url: newUrl(request.url),
59 method: 'GET',
60 proxyUrl: await proxyConfiguration!.newUrl(session!.id),
61 headers,
62 responseType: 'text',
63 } as any);
64
65 const { window } = new JSDOM(response.body, {
66 url: 'https://www.google.com/search/',
67 runScripts: 'dangerously',
68 pretendToBeVisual: true,
69 });
70
71 const { document } = window;
72
73 if (!document.querySelectorAll('#rso').length) {
74 throw new Error(`No results found`);
75 }
76
77 const relatedSearch = document.querySelector<HTMLAnchorElement>('#topstuff a.fKDtNb[href^="/search"]')?.href;
78
79 const matches = Array.from<HTMLDivElement>(document.querySelectorAll<HTMLDivElement>('#search .normal-header ~ .g')).map((div) => {
80 const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));
81 const date = info?.querySelector?.('span:nth-child(3)')?.textContent ?? null;
82
83 return {
84 title: div?.querySelector?.('h3')?.textContent,
85 url: div?.querySelector?.('a')?.href,
86 date: /\d, \d/.test(date) ? date : null,
87 text: snippet?.textContent || null,
88 };
89 });
90
91 const results = Array.from<HTMLDivElement>(document.querySelectorAll('#rso > div:first-child .g')).map((div) => {
92 const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));
93
94 return {
95 title: div?.querySelector?.('h3')?.textContent,
96 url: div?.querySelector?.('a')?.href,
97 date: /\d, \d/.test(info?.textContent) ? info.querySelector('span')?.textContent : null,
98 text: snippet?.textContent || null,
99 };
100 });
101
102 const images = Array.from<HTMLDivElement>(document.querySelectorAll('[data-lpage]')).map((div) => {
103 return {
104 image: '',
105 url: div?.dataset?.['lpage'],
106 imageData: div?.querySelector?.('img')?.src,
107 };
108 });
109
110 for (const image of images) {
111 if (!image.imageData || !image.url) {
112 continue;
113 }
114
115 const hash = createValidKVNameFromUrl(image.url);
116 const [, contentType, imageData] = image.imageData.split(/data:|;base64,/);
117
118 await Actor.setValue(
119 hash,
120 Buffer.from(imageData, 'base64'),
121 { contentType }
122 );
123
124 delete image.imageData;
125 image.image = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${hash}`;
126 }
127
128 await Actor.pushData({
129 relatedSearch: relatedSearch || null,
130 matches,
131 results,
132 images,
133 userData,
134 });
135
136 // await Actor.setValue('OUTPUT', dom.serialize(), { contentType: 'text/plain' });
137 },
138});
139
// Seed the request queue with the user-provided image URLs.
await crawler.addRequests(startUrls);

log.info('Starting the crawl.');
// Resolves once every queued request has been handled or exhausted its retries.
await crawler.run();
log.info('Crawl finished.');

// Persist state and terminate the actor run gracefully.
await Actor.exit();

crawlee_storage/key_value_stores/default/INPUT.json

{
"startUrls": [
"https://apify.com"
]
}

.gitignore

# This file tells Git which files shouldn't be added to source control
.idea
dist
node_modules
apify_storage
crawlee_storage

Dockerfile

# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:18-beta AS builder
# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
&& npm run build
# create final image
FROM apify/actor-node:18-beta
# copy only necessary files
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/README.md ./
COPY --from=builder /usr/src/app/dist ./dist
COPY --from=builder /usr/src/app/apify.json ./apify.json
COPY --from=builder /usr/src/app/INPUT_SCHEMA.json ./INPUT_SCHEMA.json
# install only prod deps
RUN npm --quiet set progress=false \
&& npm install --only=prod --no-optional \
&& echo "Installed NPM packages:" \
&& (npm list --only=prod --no-optional --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version
# run compiled code
CMD npm run start:prod

INPUT_SCHEMA.json

{
"title": "Google Search By Image",
"type": "object",
"schemaVersion": 1,
"properties": {
"startUrls": {
"title": "Image URLs",
"type": "array",
"description": "Image URLs to search for",
"editor": "requestListSources",
"prefill": [
{
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dc/Steve_Jobs_Headshot_2010-CROP_%28cropped_2%29.jpg/1024px-Steve_Jobs_Headshot_2010-CROP_%28cropped_2%29.jpg",
"userData": {
"name": "Steve Jobs"
}
}
]
},
"debug": {
"title": "Debug mode",
"type": "boolean",
"description": "Enable additional logging",
"editor": "checkbox",
"default": false
}
},
"required": [
"startUrls"
]
}

apify.json

{
"name": "crawlee-cheerio-typescript",
"title": "Google Search By Image",
"version": "0.0",
"buildTag": "latest"
}

package.json

{
"name": "crawlee-cheerio-typescript",
"version": "0.0.1",
"type": "module",
"description": "Search Google results by providing image URLs (reverse image search).",
"dependencies": {
"apify": "^3.0.2",
"crawlee": "^3.0.3",
"got-scraping": "^3.2.10",
"form-data": "^4.0.0",
"jsdom": "^20.0.0"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"ts-node": "^10.9.1",
"typescript": "^4.7.4"
},
"scripts": {
"start": "npm run start:dev",
"start:prod": "node dist/main.js",
"start:dev": "ts-node-esm -T src/main.ts",
"build": "tsc",
"test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
},
"author": "It's not you it's me",
"license": "ISC"
}

payload

-----------------------------324172699818326226173603017879
Content-Disposition: form-data; name="image_url"
-----------------------------324172699818326226173603017879
Content-Disposition: form-data; name="encoded_image"; filename=""
Content-Type: application/octet-stream
-----------------------------324172699818326226173603017879
Content-Disposition: form-data; name="image_content"
base64
-----------------------------324172699818326226173603017879
Content-Disposition: form-data; name="filename"
x.jpg
-----------------------------324172699818326226173603017879
Content-Disposition: form-data; name="hl"
en-BR
-----------------------------324172699818326226173603017879--

tsconfig.json

{
"extends": "@apify/tsconfig",
"compilerOptions": {
"module": "ES2022",
"target": "ES2022",
"outDir": "dist",
"noUnusedLocals": false,
"lib": ["DOM"]
},
"include": [
"./src/**/*"
]
}