1import { test } from 'node:test';
2import assert from 'node:assert/strict';
3import { extractHtml, emptyRecord } from '../src/extract.js';
4
/**
 * Builds a minimal HTML page whose head carries a single JSON-LD script tag.
 * The serialized node starts with '@context' set to schema.org, followed by
 * the caller's own keys.
 * @param {object} obj - JSON-LD node (or '@graph' wrapper) to embed.
 * @returns {string} The HTML document text.
 */
function ld(obj) {
  const payload = JSON.stringify({ '@context': 'https://schema.org', ...obj });
  return `<html><head><title>j</title><script type="application/ld+json">${payload}</script></head><body></body></html>`;
}

/**
 * Same page as ld(), with the embedded node typed as a schema.org JobPosting.
 * @param {object} j - JobPosting properties, merged after '@type'.
 * @returns {string} The HTML document text.
 */
function ldJob(j) {
  return ld({ '@type': 'JobPosting', ...j });
}
7
// Shared fixture: an HTML page embedding one fully populated JobPosting
// (organization, address, salary range, dates, identifier, url, description).
const FULL = ldJob({
  title: 'Senior Backend Engineer',
  hiringOrganization: {
    '@type': 'Organization',
    name: 'Acme Corp',
    logo: 'https://acme.com/logo.png',
  },
  jobLocation: {
    '@type': 'Place',
    address: {
      '@type': 'PostalAddress',
      addressLocality: 'Berlin',
      addressRegion: 'BE',
      addressCountry: 'DE',
    },
  },
  employmentType: 'FULL_TIME',
  baseSalary: {
    '@type': 'MonetaryAmount',
    currency: 'EUR',
    value: {
      '@type': 'QuantitativeValue',
      minValue: 70000,
      maxValue: 95000,
      unitText: 'YEAR',
    },
  },
  datePosted: '2026-05-01',
  validThrough: '2026-07-01',
  identifier: 'JOB-123',
  url: 'https://acme.com/jobs/be-eng',
  // HTML markup on purpose: the full-extraction test expects plain text back.
  description: '<p>Build <b>great</b> systems.</p>',
});
20
// Happy path: the FULL fixture must yield every normalized field and be
// flagged both found and complete.
test('full JobPosting extracts all key fields + complete', () => {
  const record = extractHtml(FULL, 'https://acme.com/jobs/be-eng');

  // [field, expected] pairs, checked in order with strict equality.
  const expectations = [
    ['found', true],
    ['complete', true],
    ['page_type', 'job'],
    ['title', 'Senior Backend Engineer'],
    ['company', 'Acme Corp'],
    ['company_logo', 'https://acme.com/logo.png'],
    ['location', 'Berlin, BE, DE'],
    ['employment_type', 'FULL_TIME'],
    ['salary_min', 70000],
    ['salary_max', 95000],
    ['salary_currency', 'EUR'],
    ['salary_period', 'year'],
    ['date_posted', '2026-05-01T00:00:00.000Z'],
    ['description', 'Build great systems.'],
  ];

  for (const [field, expected] of expectations) {
    assert.equal(record[field], expected);
  }
});
38
// A plain-string jobLocationType of TELECOMMUTE marks the posting as remote.
test('remote (TELECOMMUTE) detected', () => {
  const posting = {
    title: 'Remote Dev',
    hiringOrganization: { name: 'X' },
    jobLocationType: 'TELECOMMUTE',
  };
  const { remote } = extractHtml(ldJob(posting), 'https://x.com/j');
  assert.equal(remote, true);
});
43
// A Hybrid jobLocationType combined with applicantLocationRequirements must
// not be reported as remote.
test('hybrid role with applicantLocationRequirements is NOT remote', () => {
  const posting = {
    title: 'Hybrid Eng',
    hiringOrganization: { name: 'Visa' },
    jobLocation: { address: { addressLocality: 'Atlanta' } },
    jobLocationType: 'Hybrid',
    applicantLocationRequirements: 'U.S. Applicants Only',
  };
  const { remote } = extractHtml(ldJob(posting), 'https://x.com/j');
  assert.equal(remote, false);
});
48
// jobLocationType may be an array; one TELECOMMUTE entry is enough for remote.
test('jobLocationType array containing TELECOMMUTE → remote', () => {
  const posting = {
    title: 'T',
    hiringOrganization: { name: 'Y' },
    jobLocationType: ['Hybrid', 'TELECOMMUTE'],
  };
  const { remote } = extractHtml(ldJob(posting), 'https://x.com/j');
  assert.equal(remote, true);
});
53
// With no unitText and a value of 34, the salary period is expected to come
// back as 'hour'.
test('hourly salary without unitText is inferred as hour', () => {
  const posting = {
    title: 'Intern',
    hiringOrganization: { name: 'Visa' },
    jobLocation: { address: { addressLocality: 'Atlanta' } },
    baseSalary: { currency: 'USD', value: { value: 34 } },
  };
  const record = extractHtml(ldJob(posting), 'https://x.com/j');

  assert.equal(record.salary_min, 34);
  assert.equal(record.salary_period, 'hour');
});
59
// A unitText of 'YEARLY' must come back as the canonical 'year'.
test('salary period text normalized (YEARLY → year)', () => {
  const posting = {
    title: 'T',
    hiringOrganization: { name: 'Y' },
    baseSalary: {
      currency: 'USD',
      value: { minValue: 90000, maxValue: 120000, unitText: 'YEARLY' },
    },
  };
  const { salary_period: period } = extractHtml(ldJob(posting), 'https://x.com/j');
  assert.equal(period, 'year');
});
64
// Three JobPosting nodes, but only one has a url: that one is the page's job
// and the page must not be treated as a listing.
test('single job among related-jobs cards (only one has url) is picked, not listing', () => {
  const mainRole = {
    '@type': 'JobPosting',
    title: 'Main Role',
    url: 'https://x.com/main',
    hiringOrganization: { name: 'X' },
    jobLocation: { address: { addressLocality: 'NYC' } },
  };
  const relatedCards = [
    { '@type': 'JobPosting', title: 'Related A' },
    { '@type': 'JobPosting', title: 'Related B' },
  ];
  const page = ld({ '@graph': [mainRole, ...relatedCards] });

  const record = extractHtml(page, 'https://x.com/main');

  assert.equal(record.found, true);
  assert.equal(record.title, 'Main Role');
});
70
// A lone `value` (no minValue/maxValue) populates both ends of the range.
test('single salary value fills both min and max', () => {
  const posting = {
    title: 'T',
    hiringOrganization: { name: 'Y' },
    jobLocation: { address: { addressLocality: 'NYC' } },
    baseSalary: { currency: 'USD', value: { value: 50, unitText: 'HOUR' } },
  };
  const record = extractHtml(ldJob(posting), 'https://x.com/j');

  assert.equal(record.salary_min, 50);
  assert.equal(record.salary_max, 50);
  assert.equal(record.salary_period, 'hour');
});
77
// An array employmentType must surface every entry in the output field.
test('employmentType array is joined', () => {
  const posting = {
    title: 'T',
    hiringOrganization: { name: 'Y' },
    jobLocation: { address: { addressLocality: 'NYC' } },
    employmentType: ['FULL_TIME', 'CONTRACTOR'],
  };
  const record = extractHtml(ldJob(posting), 'https://x.com/j');

  for (const pattern of [/FULL_TIME/, /CONTRACTOR/]) {
    assert.match(record.employment_type, pattern);
  }
});
83
// A JobPosting nested in '@graph' next to an unrelated node is still found.
test('JobPosting inside @graph is found', () => {
  const graph = [
    { '@type': 'WebSite' },
    { '@type': 'JobPosting', title: 'Graphed Job', hiringOrganization: { name: 'G' } },
  ];
  const record = extractHtml(ld({ '@graph': graph }), 'https://x.com/j');

  assert.equal(record.found, true);
  assert.equal(record.title, 'Graphed Job');
});
89
// Several JobPosting nodes with urls, none equal to the page URL: the page is
// classified as a listing and no single job is reported.
test('LISTING (multiple jobs, no canonical match) → found=false, page_type listing', () => {
  const graph = [
    { '@type': 'ItemList' },
    { '@type': 'JobPosting', title: 'A', url: 'https://x.com/a' },
    { '@type': 'JobPosting', title: 'B', url: 'https://x.com/b' },
  ];
  const record = extractHtml(ld({ '@graph': graph }), 'https://x.com/jobs/');

  assert.equal(record.found, false);
  assert.equal(record.page_type, 'listing');
  assert.equal(record.missing_reason, 'listing_page');
});
96
// Two JobPosting nodes, and the page's canonical link points at the second
// one's url: that job is selected instead of calling the page a listing.
test('LISTING but canonical matches one job → that job picked', () => {
  const graph = {
    '@graph': [
      { '@type': 'JobPosting', title: 'A', url: 'https://x.com/a', hiringOrganization: { name: 'X' } },
      { '@type': 'JobPosting', title: 'B', url: 'https://x.com/b', hiringOrganization: { name: 'Y' } },
    ],
  };
  // Built by hand rather than via ld() because a <link rel="canonical"> is needed.
  const page = `<html><head><link rel="canonical" href="https://x.com/b"><script type="application/ld+json">${JSON.stringify(graph)}</script></head><body></body></html>`;

  const record = extractHtml(page, 'https://x.com/b');

  assert.equal(record.found, true);
  assert.equal(record.title, 'B');
});
105
// A page with only a <title> and og:title (no JSON-LD) is not a job.
test('non-job page with only <title> → found=false', () => {
  const blogPage = '<html><head><title>A Blog Post</title><meta property="og:title" content="A Blog Post"></head><body><p>hi</p></body></html>';
  const record = extractHtml(blogPage, 'https://blog.x.com/post');

  assert.equal(record.found, false);
  assert.equal(record.missing_reason, 'non_job');
});
111
// Unparseable JSON-LD must still produce an 'ok' record with found=false
// rather than throwing.
test('malformed JSON-LD does not throw', () => {
  const brokenPage = '<html><head><script type="application/ld+json">{ "@type":"JobPosting", bad }</script></head><body></body></html>';
  const record = extractHtml(brokenPage, 'https://x.com/bad');

  assert.equal(record.status, 'ok');
  assert.equal(record.found, false);
});
117
// Records from a successful extraction and from emptyRecord() must expose the
// same set of keys.
test('ok and error records share the EXACT same keys', () => {
  const sortedKeys = (record) => Object.keys(record).sort();

  const okRecord = extractHtml(FULL, 'https://acme.com/jobs/be-eng');
  const errorRecord = emptyRecord({ requested_url: 'https://x.com/y', error: 'http 500' });

  assert.deepEqual(sortedKeys(okRecord), sortedKeys(errorRecord));
  assert.equal(errorRecord.status, 'error');
});
124
125
// Checks the composed `salary` string for a range, a single value, and no
// salary, then confirms the record has the same keys as an emptyRecord().
test('salary composes min+max, single value, none→null, and keeps ok/error key parity', () => {
  const pageUrl = 'https://x.com/j';
  // Page for a USD posting whose baseSalary.value is the given object.
  const salariedPage = (value) =>
    ldJob({ title: 'T', hiringOrganization: { name: 'Y' }, baseSalary: { currency: 'USD', value } });

  const range = extractHtml(salariedPage({ minValue: 150000, maxValue: 185000, unitText: 'YEAR' }), pageUrl);
  assert.equal(range.salary, '$150k–185k/yr');

  const single = extractHtml(salariedPage({ value: 150000, unitText: 'YEAR' }), pageUrl);
  assert.equal(single.salary, '$150k/yr');

  const none = extractHtml(ldJob({ title: 'T', hiringOrganization: { name: 'Y' } }), pageUrl);
  assert.equal(none.salary, null);

  const err = emptyRecord({ requested_url: 'https://x.com/y' });
  assert.deepEqual(Object.keys(range).sort(), Object.keys(err).sort());
});
136
137
// A JobPosting node with no properties yields nothing: not found, no fields,
// and no apply_url.
test('empty JobPosting node is not billed as found', () => {
  const record = extractHtml(ldJob({}), 'https://example.com/job');

  assert.equal(record.found, false);
  assert.deepEqual(record.fields_found, []);
  assert.equal(record.apply_url, null);
});
// When the posting has no url, apply_url stays null instead of falling back
// to the page URL, while the job itself is still found.
test('apply_url not manufactured from finalUrl', () => {
  const posting = { title: 'Eng', hiringOrganization: { name: 'Acme' } };
  const record = extractHtml(ldJob(posting), 'https://example.com/job');

  assert.equal(record.apply_url, null);
  assert.equal(record.found, true);
});
149
150
// 50,000-character inputs must be truncated: text fields to at most 300
// characters, salary_period (when present) to at most 32.
test('oversized fields are capped', () => {
  const big = 'A'.repeat(50000);
  const posting = {
    title: big,
    hiringOrganization: { name: big },
    identifier: big,
    jobLocation: { address: { addressLocality: big } },
    baseSalary: { currency: 'USD', value: { value: 10, unitText: big } },
  };
  const record = extractHtml(ldJob(posting), 'https://x.com/j');

  for (const field of ['title', 'company', 'identifier', 'location']) {
    assert.ok(record[field].length <= 300);
  }
  // salary_period may be dropped entirely; `== null` covers null and undefined.
  const period = record.salary_period;
  assert.ok(period == null || period.length <= 32);
});
160
161
// Requested and final URLs that differ only in path letter case still count
// as a redirect.
test('case-differing path counts as redirected', () => {
  const finalUrl = 'https://x.com/Jobs/Senior';
  const page = ldJob({ title: 'X', hiringOrganization: { name: 'Y' } });

  const { redirected } = extractHtml(page, finalUrl, {
    requestedUrl: 'https://x.com/jobs/senior',
    finalUrl,
  });

  assert.equal(redirected, true);
});
166
167
// Requested and final URLs that differ only in the query string still count
// as a redirect.
test('query-only difference counts as redirected', () => {
  const finalUrl = 'https://x.com/p?a=2';
  const page = ldJob({ title: 'X', hiringOrganization: { name: 'Y' } });

  const { redirected } = extractHtml(page, finalUrl, {
    requestedUrl: 'https://x.com/p?a=1',
    finalUrl,
  });

  assert.equal(redirected, true);
});
172
173
// emptyRecord() must drop base keys it does not know, so its key set still
// matches that of a successful extraction.
test('emptyRecord ignores unknown base keys (key parity)', () => {
  const okRecord = extractHtml(ldJob({ title: 'X', hiringOrganization: { name: 'Y' } }), 'https://x.com/j');
  const errorRecord = emptyRecord({ requested_url: 'x', whoops_extra: 1 });

  assert.ok(!('whoops_extra' in errorRecord));
  assert.deepEqual(Object.keys(errorRecord).sort(), Object.keys(okRecord).sort());
});