1import { load } from 'cheerio';
2import { extractJsonLd, findByType, extractMicrodata } from './lib/jsonld.js';
3import { firstString, toArray, parsePrice, cleanText, absoluteUrl, normalizeDate, currencyFromSymbol } from './lib/normalize.js';
4
// Ordered [pattern, period] pairs; the first pattern that matches the unit text wins.
const PERIOD_MAP = [
  [/hour|hourly|hr/i, 'hour'],
  [/day|daily/i, 'day'],
  [/week/i, 'week'],
  [/month/i, 'month'],
  [/year|yearly|annum|annual/i, 'year'],
];

/**
 * Resolve a salary period from unit text, falling back to the amount's magnitude.
 *
 * @param {*} unitText - Raw unit value; passed through `firstString` before matching.
 * @param {number|null|undefined} value - Salary amount used only when no unit text is available.
 * @returns {string|null} A period from PERIOD_MAP, the lower-cased unit text when it matches
 *   no pattern, a magnitude-based guess ('hour' or 'year'), or null.
 */
function normalizeSalaryPeriod(unitText, value) {
  const label = firstString(unitText);

  const known = PERIOD_MAP.find(([pattern]) => pattern.test(label));
  if (known) return known[1];

  // Unrecognised but non-empty unit text is passed through lower-cased.
  if (label) return label.toLowerCase();

  // No unit at all: guess from the amount. Values from 250 up to (but excluding) 1000
  // are treated as ambiguous and yield null.
  if (value == null) return null;
  if (value > 0 && value < 250) return 'hour';
  return value >= 1000 ? 'year' : null;
}
21
22
23
24
25
26
// Output keys that are checked for a non-empty value when building `fields_found`.
const FIELD_KEYS = [
  'title',
  'company',
  'location',
  'remote',
  'salary_min',
  'employment_type',
  'date_posted',
  'apply_url',
];
28
/**
 * Build a fresh record with every output key present and set to its default.
 *
 * @returns {object} A new object on every call (including a new `fields_found` array).
 */
function blank() {
  return {
    // Request / response metadata
    status: 'ok',
    requested_url: null,
    final_url: null,
    http_status: null,
    redirected: false,

    // Classification of the page
    found: false,
    complete: false,
    page_type: 'unknown',
    source: 'none',
    render_required: false,

    // Job fields
    title: null,
    company: null,
    company_logo: null,
    location: null,
    remote: null,
    employment_type: null,
    salary_min: null,
    salary_max: null,
    salary_currency: null,
    salary_period: null,
    date_posted: null,
    valid_through: null,
    description: null,
    identifier: null,
    apply_url: null,

    // Diagnostics
    fields_found: [],
    missing_reason: null,
    error: null,
    extracted_at: null,
  };
}
39
/**
 * Build a record describing a failed fetch.
 *
 * Starts from the blank record, marks it as an error with `missing_reason: 'fetch_error'`
 * and the current ISO timestamp, then lets `base` override any of those keys.
 *
 * @param {object} [base] - Caller-supplied fields; applied last, so they win.
 * @returns {object} The merged record.
 */
export function emptyRecord(base) {
  const failureDefaults = {
    status: 'error',
    missing_reason: 'fetch_error',
    extracted_at: new Date().toISOString(),
  };
  return { ...blank(), ...failureDefaults, ...base };
}
43
/**
 * Return the first non-blank `content` attribute among the given meta tag names.
 *
 * Each name is looked up both as `meta[property=…]` and `meta[name=…]`, in the
 * order the names are supplied.
 *
 * @param {Function} $ - Selector function whose result exposes `.attr(name)`.
 * @param {string[]} names - Meta property/name values to try, in priority order.
 * @returns {string} The trimmed content, or '' when none of the names yields text.
 */
function metaContent($, names) {
  for (const key of names) {
    const selector = `meta[property="${key}"], meta[name="${key}"]`;
    const trimmed = ($(selector).attr('content') || '').trim();
    if (trimmed) return trimmed;
  }
  return '';
}
51
/**
 * Reduce a URL to a comparison key: host + pathname, without trailing slashes, lower-cased.
 * Scheme, query string and fragment are not part of the key.
 *
 * @param {string} u - Absolute URL, or a URL relative to `base`.
 * @param {string} [base] - Base URL used to resolve `u`; may be empty or omitted.
 * @returns {string} The comparison key, or '' when the URL cannot be parsed.
 */
function normUrl(u, base) {
  try {
    // The URL constructor rejects an empty-string (or null) base even when `u` is
    // absolute, so only pass a base when there actually is one.
    const parsed = base ? new URL(u, base) : new URL(u);
    return (parsed.host + parsed.pathname).replace(/\/+$/, '').toLowerCase();
  } catch {
    // Unparseable input (e.g. a relative URL with no usable base) has no key.
    return '';
  }
}
60
/**
 * Produce a display string for the first usable entry of a `jobLocation` value.
 *
 * Accepted entry shapes: a plain string, an object whose `address` is a string,
 * an object whose `address` is an object with addressLocality / addressRegion /
 * addressCountry, or an object carrying those three fields directly.
 * Entries that yield no text (including whitespace-only strings) are skipped so a
 * later entry can still supply the location.
 *
 * @param {*} jobLocation - Single location or list of locations (normalized via `toArray`).
 * @returns {string} The location text, or '' when no entry yields one.
 */
function formatLocation(jobLocation) {
  for (const loc of toArray(jobLocation)) {
    if (!loc) continue;

    if (typeof loc === 'string') {
      const text = loc.trim();
      if (text) return text;
      continue;
    }

    if (typeof loc.address === 'string') {
      const text = loc.address.trim();
      if (text) return text;
      // Blank address string: fall through and try the fields on the entry itself.
    }

    const addr = loc.address && typeof loc.address === 'object' ? loc.address : loc;
    const parts = [addr.addressLocality, addr.addressRegion, addr.addressCountry].map(firstString).filter(Boolean);
    // De-duplicate so e.g. a city that equals its region is listed once.
    if (parts.length) return [...new Set(parts)].join(', ');
  }
  return '';
}
73
/**
 * Extract a salary range from a JSON-LD `baseSalary` value.
 *
 * `baseSalary.value` may be:
 *  - an array of amounts (numbers/strings, or objects read via value / minValue / maxValue),
 *  - an object with minValue / maxValue / value and optional unitText,
 *  - a string such as "50000 - 60000" (split on -, en/em dash or "to"),
 *  - any other scalar, which is handed to `parsePrice` as a single amount.
 *
 * @param {*} bs - The `baseSalary` node, or an array of such nodes (the first object entry is used).
 * @returns {{min: (number|null), max: (number|null), currency: string, period: (string|null)}}
 *   `min`/`max` are ordered so that min <= max; `currency` is '' when none is found.
 */
function parseSalary(bs) {
  const nothing = { min: null, max: null, currency: '', period: null };
  if (!bs || typeof bs !== 'object') return nothing;

  // An empty array, or one whose leading entries are null/non-objects, used to make the
  // property reads below throw; pick the first real object instead.
  const b = Array.isArray(bs) ? bs.find((entry) => entry && typeof entry === 'object') : bs;
  if (!b) return nothing;

  let currency = firstString(b.currency) || firstString(b.salaryCurrency);
  const v = b.value;
  let min = null;
  let max = null;
  let unit = null;

  if (Array.isArray(v)) {
    const nums = v
      .map((x) => parsePrice(x && typeof x === 'object' ? (x.value ?? x.minValue ?? x.maxValue) : x))
      .filter((n) => n != null);
    min = nums.length ? Math.min(...nums) : null;
    max = nums.length ? Math.max(...nums) : null;
  } else if (v && typeof v === 'object') {
    // A single `value` stands in for whichever bound is missing.
    min = parsePrice(v.minValue != null ? v.minValue : v.value);
    max = parsePrice(v.maxValue != null ? v.maxValue : v.value);
    unit = v.unitText;
  } else if (typeof v === 'string') {
    const parts = v
      .split(/\s*(?:-|–|—|to)\s*/i)
      .map((s) => parsePrice(s))
      .filter((n) => n != null);
    min = parts.length ? Math.min(...parts) : null;
    max = parts.length ? Math.max(...parts) : null;
    // Only a string value can carry a currency symbol to fall back on.
    if (!currency) {
      const sym = currencyFromSymbol(v);
      if (sym) currency = sym.iso;
    }
  } else {
    const p = parsePrice(v);
    min = p;
    max = p;
  }

  // Publishers sometimes swap the bounds.
  if (min != null && max != null && min > max) [min, max] = [max, min];
  return { min, max, currency, period: normalizeSalaryPeriod(unit, min) };
}
107
/**
 * Choose the JobPosting node that describes the current page.
 *
 * With zero or one candidate the answer is trivial. Otherwise candidates are tried in turn:
 *  1. a job whose `@id` or `url` normalizes to the page's canonical URL
 *     (link[rel=canonical], then og:url, then `finalUrl`);
 *  2. the only job that carries an `@id` or `url` at all;
 *  3. a job whose title appears in the page title (<title>, then og:title), or whose
 *     title contains the first 30 characters of the page title.
 *
 * @param {object[]} jobs - Candidate JobPosting nodes.
 * @param {Function} $ - Selector function for the parsed page.
 * @param {string} finalUrl - URL of the page; also the base for relative job URLs.
 * @returns {object|null} The selected job, or null when the choice is ambiguous.
 */
function pickMain(jobs, $, finalUrl) {
  if (jobs.length <= 1) return jobs[0] || null;

  // 1. Match against the canonical URL of the page.
  const canonicalHref = $('link[rel="canonical"]').attr('href') || metaContent($, ['og:url']) || finalUrl || '';
  const canonicalKey = normUrl(canonicalHref, finalUrl);
  if (canonicalKey) {
    const pointsAtPage = (job) => {
      for (const ref of [job['@id'], job.url]) {
        if (ref && normUrl(ref, finalUrl) === canonicalKey) return true;
      }
      return false;
    };
    const byUrl = jobs.find((job) => pointsAtPage(job));
    if (byUrl) return byUrl;
  }

  // 2. Exactly one job has an identifier of its own.
  const identified = jobs.filter((job) => job['@id'] || job.url);
  if (identified.length === 1) return identified[0];

  // 3. Compare job titles with the page title.
  const rawPageTitle = firstString($('title').first().text()) || metaContent($, ['og:title']);
  const pageTitle = rawPageTitle.toLowerCase();
  if (!pageTitle) return null;

  const pageTitlePrefix = pageTitle.slice(0, 30);
  const byTitle = jobs.find((job) => {
    const jobTitle = firstString(job.title).toLowerCase();
    return jobTitle && (pageTitle.includes(jobTitle) || jobTitle.includes(pageTitlePrefix));
  });
  return byTitle || null;
}
130
/**
 * Heuristically decide whether the page appears to need client-side rendering.
 *
 * Returns true when any of these hold:
 *  - script text mentions one of the framework markers in the pattern below;
 *  - a mount element (#root, #app, #__next, [data-reactroot]) exists with fewer than
 *    20 characters of trimmed text;
 *  - the body, ignoring script/style/noscript, has fewer than 40 words while the page
 *    references at least 3 external scripts.
 *
 * @param {Function} $ - Selector function for the parsed page.
 * @returns {boolean} True when the page looks like an unrendered client-side app shell.
 */
function looksClientRendered($) {
  const frameworkMarkers = /__NEXT_DATA__|__NUXT__|window\.__INITIAL_STATE__|data-reactroot|data-reacthelmet/;
  const scriptText = $('script').text();
  if (frameworkMarkers.test(scriptText)) return true;

  const mountPoint = $('#root, #app, #__next, [data-reactroot]').first();
  const hasMountPoint = mountPoint.length > 0;
  if (hasMountPoint && mountPoint.text().trim().length < 20) return true;

  // Count words on a copy of the body so stripping non-content tags leaves the page intact.
  const bodyCopy = $('body').clone();
  bodyCopy.find('script,style,noscript').remove();
  const visibleText = bodyCopy.text().replace(/\s+/g, ' ').trim();
  const wordCount = visibleText.split(' ').filter(Boolean).length;

  const externalScripts = $('script[src]').length;
  return wordCount < 40 && externalScripts >= 3;
}
141
/**
 * Map the selected JSON-LD JobPosting node onto the record's job fields.
 * `remote` is true for a TELECOMMUTE location type, false when only a location is known,
 * and null otherwise.
 */
function jsonLdJobFields(main, finalUrl) {
  const org = main.hiringOrganization;
  const title = firstString(main.title) || null;
  const company = firstString(org) || null;
  const companyLogo = absoluteUrl(firstString(org && org.logo), finalUrl) || null;
  const location = formatLocation(main.jobLocation) || null;

  const locationTypes = toArray(main.jobLocationType).map((s) => firstString(s).toUpperCase());
  let remote = null;
  if (locationTypes.includes('TELECOMMUTE')) remote = true;
  else if (location) remote = false;

  const employmentType = toArray(main.employmentType).map(firstString).filter(Boolean).join(', ') || null;
  const salary = parseSalary(main.baseSalary);

  return {
    title,
    company,
    company_logo: companyLogo,
    location,
    remote,
    employment_type: employmentType,
    salary_min: salary.min,
    salary_max: salary.max,
    salary_currency: salary.currency || null,
    salary_period: salary.period,
    date_posted: normalizeDate(main.datePosted),
    valid_through: normalizeDate(main.validThrough),
    description: cleanText(main.description, 800) || null,
    identifier: firstString(main.identifier) || null,
    // Fall back to the page itself when the posting names no URL.
    apply_url: absoluteUrl(firstString(main.url) || finalUrl, finalUrl) || null,
  };
}

/** Map a microdata JobPosting (as returned by `extractMicrodata`) onto the non-title job fields. */
function microdataJobFields(md) {
  const company = firstString(md.hiringOrganization) || firstString(md.name) || null;
  const parts = [md.addressLocality, md.addressRegion, md.addressCountry].map(firstString).filter(Boolean);
  const place = parts.length ? [...new Set(parts)].join(', ') : firstString(md.jobLocation);
  return {
    company,
    location: place || null,
    employment_type: firstString(md.employmentType) || null,
    date_posted: normalizeDate(md.datePosted),
    valid_through: normalizeDate(md.validThrough),
  };
}

/**
 * Extract a job record from an already-parsed page.
 *
 * Sources are tried in order: a JSON-LD JobPosting (chosen by `pickMain`), then the
 * og:title meta tag for a missing title, then — only when no JSON-LD job was chosen —
 * schema.org/JobPosting microdata. A page with several JobPostings or a list-type node
 * and no chosen job is classified as a listing.
 *
 * @param {Function} $ - Parsed page, passed through to the selector-based helpers.
 * @param {object} [ctx] - Request context.
 * @param {string} [ctx.finalUrl] - URL the page was served from (falls back to `ctx.url`, then '').
 * @param {string} [ctx.url] - Alternative spelling of the page URL.
 * @param {string} [ctx.requestedUrl] - URL originally requested (defaults to the final URL).
 * @param {number} [ctx.httpStatus] - HTTP status to record.
 * @param {string} [ctx.now] - Timestamp to record instead of the current time.
 * @returns {object} A full record (all keys of the blank record) with `status: 'ok'`.
 */
export function extractFromCheerio($, ctx = {}) {
  const finalUrl = ctx.finalUrl || ctx.url || '';
  const requestedUrl = ctx.requestedUrl || finalUrl;

  const ldObjects = extractJsonLd($);
  const jobs = findByType(ldObjects, ['JobPosting']);
  const listNodes = findByType(ldObjects, ['ItemList', 'CollectionPage', 'SearchResultsPage']);
  const main = jobs.length ? pickMain(jobs, $, finalUrl) : null;

  const out = main ? jsonLdJobFields(main, finalUrl) : {};
  let source = main ? 'json-ld' : 'none';
  let pageType = 'unknown';
  if (main) pageType = 'job';
  else if (jobs.length > 1 || listNodes.length > 0) pageType = 'listing';

  // og:title fills a missing title on any page, job or not.
  if (!out.title) {
    const ogTitle = metaContent($, ['og:title']);
    if (ogTitle) out.title = ogTitle;
  }

  if (!main) {
    const md = extractMicrodata($, 'schema.org/JobPosting');
    if (md && (md.title || md.jobTitle)) {
      source = 'microdata';
      pageType = 'job';
      // A title already taken from og:title is kept; the microdata title is only a fallback.
      if (!out.title) out.title = firstString(md.title || md.jobTitle);
      Object.assign(out, microdataJobFields(md));
    }
  }

  const title = out.title || null;
  const company = out.company || null;
  const location = out.location || null;
  const remote = out.remote ?? null;

  const found = Boolean(main || (source === 'microdata' && title));
  const complete = Boolean(found && title && company && (location || remote === true));
  const renderRequired = !found && looksClientRendered($);

  let missingReason = null;
  if (!found) {
    if (pageType === 'listing') missingReason = 'listing_page';
    else if (renderRequired) missingReason = 'js_rendered';
    else missingReason = 'non_job';
  }

  const record = { ...blank(), ...out, title, company, location, remote };
  const fieldsFound = FIELD_KEYS.filter((key) => record[key] != null && record[key] !== '');

  return {
    ...record,
    status: 'ok',
    requested_url: requestedUrl,
    final_url: finalUrl,
    http_status: ctx.httpStatus ?? null,
    redirected: normUrl(requestedUrl, finalUrl) !== normUrl(finalUrl, finalUrl),
    found,
    complete,
    page_type: pageType,
    source,
    render_required: renderRequired,
    fields_found: fieldsFound,
    missing_reason: missingReason,
    error: null,
    extracted_at: ctx.now || new Date().toISOString(),
  };
}
248
/**
 * Parse an HTML string and extract a job record from it.
 *
 * @param {string} html - Page markup, parsed with cheerio's `load`.
 * @param {string} url - Page URL; used as both final and requested URL unless `ctx` overrides them.
 * @param {object} [ctx] - Extra context forwarded to `extractFromCheerio`.
 * @returns {object} The record produced by `extractFromCheerio`.
 */
export function extractHtml(html, url, ctx = {}) {
  const $ = load(html);
  const resolvedCtx = {
    ...ctx,
    finalUrl: ctx.finalUrl || url,
    requestedUrl: ctx.requestedUrl || url,
  };
  return extractFromCheerio($, resolvedCtx);
}