1import { Actor } from 'apify';
2import { BasicCrawler, log } from 'crawlee';
3import { gotScraping } from 'got-scraping';
4import FormData from 'form-data';
5import { JSDOM } from 'jsdom';
6import { createHash } from 'crypto';
7
8const createValidKVNameFromUrl = (url: string) => createHash('sha256').update(url).digest('hex').slice(0, -2);
9
10const headers: Record<string, string> = {
11 'Alt-Used': 'www.google.com',
12 'Origin': 'https://www.google.com',
13 'Referer': 'https://www.google.com/',
14 'Cache-Control': 'no-cache',
15 'Pragma': "no-cache",
16};
17
// Initialize the Apify SDK before any storage/env access below.
await Actor.init();

// Shape of the actor's input (read from the default key-value store).
interface InputSchema {
    startUrls: string[];
    debug?: boolean;
}

// Missing input falls back to an empty object so startUrls defaults to [].
const { startUrls = [], debug, } = await Actor.getInput<InputSchema>() ?? {};

if (debug) {
    log.setLevel(log.LEVELS.DEBUG);
}

// US residential proxies — Google aggressively blocks datacenter IPs.
const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'US'
});
35
36const newUrl = (imageUrl: string) => {
37 const nUrl = new URL('https://www.google.com.br/searchbyimage');
38
39 nUrl.searchParams.set('image_url', imageUrl);
40 nUrl.searchParams.set('btnG', 'Search by image');
41 nUrl.searchParams.set('encoded_image', '');
42 nUrl.searchParams.set('image_content', '');
43 nUrl.searchParams.set('filename', '');
44 nUrl.searchParams.set('hl', 'en');
45
46 return nUrl.toString();
47}
48
// Store id is needed to build public record URLs for saved thumbnails.
const { defaultKeyValueStoreId } = Actor.getEnv();
50
51const crawler = new BasicCrawler({
52 maxConcurrency: 3,
53 useSessionPool: true,
54 async requestHandler({ session, request }) {
55 const { userData } = request;
56
57 const response = await gotScraping({
58 url: newUrl(request.url),
59 method: 'GET',
60 proxyUrl: await proxyConfiguration!.newUrl(session!.id),
61 headers,
62 responseType: 'text',
63 } as any);
64
65 const { window } = new JSDOM(response.body, {
66 url: 'https://www.google.com/search/',
67 runScripts: 'dangerously',
68 pretendToBeVisual: true,
69 });
70
71 const { document } = window;
72
73 if (!document.querySelectorAll('#rso').length) {
74 throw new Error(`No results found`);
75 }
76
77 const relatedSearch = document.querySelector<HTMLAnchorElement>('#topstuff a.fKDtNb[href^="/search"]')?.href;
78
79 const matches = Array.from<HTMLDivElement>(document.querySelectorAll<HTMLDivElement>('#search .normal-header ~ .g')).map((div) => {
80 const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));
81 const date = info?.querySelector?.('span:nth-child(3)')?.textContent ?? null;
82
83 return {
84 title: div?.querySelector?.('h3')?.textContent,
85 url: div?.querySelector?.('a')?.href,
86 date: /\d, \d/.test(date) ? date : null,
87 text: snippet?.textContent || null,
88 };
89 });
90
91 const results = Array.from<HTMLDivElement>(document.querySelectorAll('#rso > div:first-child .g')).map((div) => {
92 const [info, snippet] = Array.from(div.querySelectorAll('[data-content-feature="1"] > div > span'));
93
94 return {
95 title: div?.querySelector?.('h3')?.textContent,
96 url: div?.querySelector?.('a')?.href,
97 date: /\d, \d/.test(info?.textContent) ? info.querySelector('span')?.textContent : null,
98 text: snippet?.textContent || null,
99 };
100 });
101
102 const images = Array.from<HTMLDivElement>(document.querySelectorAll('[data-lpage]')).map((div) => {
103 return {
104 image: '',
105 url: div?.dataset?.['lpage'],
106 imageData: div?.querySelector?.('img')?.src,
107 };
108 });
109
110 for (const image of images) {
111 if (!image.imageData || !image.url) {
112 continue;
113 }
114
115 const hash = createValidKVNameFromUrl(image.url);
116 const [, contentType, imageData] = image.imageData.split(/data:|;base64,/);
117
118 await Actor.setValue(
119 hash,
120 Buffer.from(imageData, 'base64'),
121 { contentType }
122 );
123
124 delete image.imageData;
125 image.image = `https://api.apify.com/v2/key-value-stores/${defaultKeyValueStoreId}/records/${hash}`;
126 }
127
128 await Actor.pushData({
129 relatedSearch: relatedSearch || null,
130 matches,
131 results,
132 images,
133 userData,
134 });
135
136
137 },
138});
139
// Seed the queue with the input image URLs (each becomes one search).
await crawler.addRequests(startUrls);

log.info('Starting the crawl.');
await crawler.run();
log.info('Crawl finished.');

// Flush storages and end the actor run cleanly.
await Actor.exit();