1"""
2Unit tests for parser.py — no network calls, all inputs are fixture HTML strings.
3"""
4
5import pytest
6from src.parser import (
7 parse_report_links,
8 parse_report_page,
9 compute_trends,
10 _extract_report_date,
11 _extract_overall_count,
12 _extract_field_breakdowns,
13 _parse_field_line,
14 _last_day_of_month,
15)
16from bs4 import BeautifulSoup
17
18
19
20
21
22
# Listing-page fixture: four report links inside <h2> headings (three English
# "cumulative" slugs and one percent-encoded slug) followed by two links
# ("/en/about/" and "some-other-news") that the tests expect to be ignored.
LISTING_PAGE_HTML = """
<html><body>
 <article>
 <h2><a href="/en/news/2025-oct-cumulative-number-of-employment-gold-card-issuance/">OCT 2025</a></h2>
 <h2><a href="/en/news/2025-sep-cumulative-number-of-employment-gold-card-issuance/">SEP 2025</a></h2>
 <h2><a href="/en/news/2025-aug-cumulative-number-of-employment-gold-card-issuance/">AUG 2025</a></h2>
 <h2><a href="/en/news/2024%E5%B9%B412%E6%9C%88%E5%B0%B1%E6%A5%AD%E9%87%91%E5%8D%A1%E6%9C%80%E6%96%B0%E6%A0%B8%E7%99%BC%E6%95%B8%E5%AD%97/">DEC 2024</a></h2>
 <a href="/en/about/">About (should be ignored)</a>
 <a href="/en/news/some-other-news/">Other news (ignored)</a>
 </article>
</body></html>
"""
35
# October 2025 report fixture: an English <h1>, two headline counts written as
# "<label>: <number> (<date text>)", and two eleven-item field lists, each
# introduced by its own <h3> ("Valid Cards by Field", "Total Approved by Field").
# The per-field numbers add up to the headline counts (8,129 and 14,907).
REPORT_PAGE_OCT_2025 = """
<html><body>
<article>
 <h1>2025 OCT Cumulative Number of Employment Gold Card Issuance</h1>
 <p>Currently Valid Cards: 8,129 (as of October 31, 2025)</p>
 <p>Total Approved Since Program Launch: 14,907 (February 8, 2018 to October 31, 2025)</p>

 <h3>Valid Cards by Field</h3>
 <ul>
 <li>Science & Technology: 1,407</li>
 <li>Economy: 3,038</li>
 <li>Education: 1,676</li>
 <li>Culture and Arts: 421</li>
 <li>Sport: 22</li>
 <li>Finance: 398</li>
 <li>Law: 12</li>
 <li>Architectural Design: 37</li>
 <li>National Defense: 23</li>
 <li>Cross-field: 157</li>
 <li>Digital: 938</li>
 </ul>

 <h3>Total Approved by Field</h3>
 <ul>
 <li>Science & Technology: 2,769</li>
 <li>Economy: 6,528</li>
 <li>Education: 2,417</li>
 <li>Culture and Arts: 889</li>
 <li>Sport: 25</li>
 <li>Finance: 898</li>
 <li>Law: 23</li>
 <li>Architectural Design: 74</li>
 <li>National Defense: 31</li>
 <li>Cross-field: 260</li>
 <li>Digital: 993</li>
 </ul>
</article>
</body></html>
"""
75
# December 2024 report fixture: non-English <h1>, headline counts written as
# "<label> (<date text>): <number> cards", and field lists under the shorter
# <h3> titles "Currently Valid" / "Total Approved", listed in a different
# order than in the October 2025 fixture.
REPORT_PAGE_DEC_2024 = """
<html><body>
<article>
 <h1>2024年12月就業金卡最新核發數字</h1>
 <p>Valid Cards (as of December 31, 2024): 7,545 cards</p>
 <p>Total Approved (February 8, 2018 - December 31, 2024): 12,082 cards</p>

 <h3>Currently Valid</h3>
 <ul>
 <li>Economy: 2,913</li>
 <li>Science & Technology: 1,426</li>
 <li>Education: 1,384</li>
 <li>Digital: 690</li>
 <li>Finance: 418</li>
 <li>Culture and Arts: 421</li>
 <li>Cross-field: 207</li>
 <li>Architectural Design: 39</li>
 <li>National Defense: 19</li>
 <li>Law: 13</li>
 <li>Sport: 15</li>
 </ul>

 <h3>Total Approved</h3>
 <ul>
 <li>Economy: 5,493</li>
 <li>Science & Technology: 2,230</li>
 <li>Education: 1,794</li>
 <li>Culture and Arts: 758</li>
 <li>Finance: 768</li>
 <li>Digital: 705</li>
 <li>Cross-field: 219</li>
 <li>Architectural Design: 59</li>
 <li>National Defense: 20</li>
 <li>Law: 19</li>
 <li>Sport: 17</li>
 </ul>
</article>
</body></html>
"""
115
116
# Minimal report fixture: two headline counts with no date text, no headings,
# and a single two-item field list.
REPORT_PAGE_MINIMAL = """
<html><body>
<article>
 <p>Currently Valid Cards: 100</p>
 <p>Total Approved Since Program Launch: 200</p>
 <ul>
 <li>Economy: 60</li>
 <li>Digital: 40</li>
 </ul>
</article>
</body></html>
"""
129
130
# Alternate-wording fixture: the headline counts are embedded in sentences
# with the number first ("8,129 cards are currently valid ..."), followed by
# a single five-item field list without any heading.
REPORT_PAGE_ALT_FORMAT = """
<html><body>
<article>
 <p>8,129 cards are currently valid as of October 31, 2025.</p>
 <p>14,907 cards have been approved since the program launched.</p>
 <ul>
 <li>Economy: 3,038</li>
 <li>Science & Technology: 1,407</li>
 <li>Education: 1,676</li>
 <li>Finance: 398</li>
 <li>Digital: 938</li>
 </ul>
</article>
</body></html>
"""
146
147
148
149
150
151
class TestParseReportLinks:
    """Checks for ``parse_report_links`` against the listing-page fixture."""

    def test_extracts_english_cumulative_links(self):
        """Every English monthly slug in the fixture appears in some returned link."""
        found = parse_report_links(LISTING_PAGE_HTML)
        for slug in ("2025-oct-cumulative", "2025-sep-cumulative", "2025-aug-cumulative"):
            assert any(slug in href for href in found)

    def test_extracts_chinese_encoded_links(self):
        """At least one returned link contains "2024" (the percent-encoded slug)."""
        found = parse_report_links(LISTING_PAGE_HTML)
        with_year = [href for href in found if "2024" in href]
        assert with_year

    def test_ignores_non_report_links(self):
        """Neither the about page nor the unrelated news item is returned."""
        found = parse_report_links(LISTING_PAGE_HTML)
        assert all("/en/about/" not in href for href in found)
        assert all("some-other-news" not in href for href in found)

    def test_returns_absolute_urls(self):
        """Every returned link starts with the site origin."""
        origin = "https://goldcard.nat.gov.tw"
        for link in parse_report_links(LISTING_PAGE_HTML):
            assert link.startswith(origin), f"Non-absolute URL: {link}"

    def test_no_duplicates(self):
        """The returned collection contains each link only once."""
        found = parse_report_links(LISTING_PAGE_HTML)
        distinct = set(found)
        assert len(found) == len(distinct)

    def test_empty_page_returns_empty_list(self):
        """A page without anchors yields an empty list."""
        found = parse_report_links("<html><body></body></html>")
        assert found == []
180
181
182
183
184
185
class TestParseReportPageOct2025:
    """Field-by-field checks of ``parse_report_page`` on the October 2025 fixture."""

    URL = "https://goldcard.nat.gov.tw/en/news/2025-oct-cumulative-number-of-employment-gold-card-issuance/"

    @pytest.fixture
    def result(self):
        """Parsed output for the October 2025 fixture page."""
        parsed = parse_report_page(REPORT_PAGE_OCT_2025, self.URL)
        return parsed

    # --- top-level values -------------------------------------------------

    def test_report_date(self, result):
        reported = result["report_date"]
        assert reported == "2025-10-31"

    def test_report_url(self, result):
        # The URL passed in is expected back unchanged.
        assert self.URL == result["report_url"]

    def test_total_approved(self, result):
        approved = result["total_approved"]
        assert approved == 14907

    def test_currently_valid(self, result):
        valid = result["currently_valid"]
        assert valid == 8129

    # --- "valid" breakdown ------------------------------------------------

    def test_valid_by_field_count(self, result):
        # The fixture lists eleven fields under "Valid Cards by Field".
        valid_fields = result["valid_by_field"]
        assert len(valid_fields) == 11

    def test_valid_economy(self, result):
        valid_fields = result["valid_by_field"]
        assert valid_fields["Economy"] == 3038

    def test_valid_science_tech(self, result):
        valid_fields = result["valid_by_field"]
        assert valid_fields["Science & Technology"] == 1407

    def test_valid_finance(self, result):
        valid_fields = result["valid_by_field"]
        assert valid_fields["Finance"] == 398

    def test_valid_digital(self, result):
        valid_fields = result["valid_by_field"]
        assert valid_fields["Digital"] == 938

    # --- "approved" breakdown ---------------------------------------------

    def test_approved_by_field_count(self, result):
        # The fixture lists eleven fields under "Total Approved by Field".
        approved_fields = result["approved_by_field"]
        assert len(approved_fields) == 11

    def test_approved_economy(self, result):
        approved_fields = result["approved_by_field"]
        assert approved_fields["Economy"] == 6528

    # --- cross-checks -----------------------------------------------------

    def test_valid_field_sum_matches_total(self, result):
        """Per-field valid counts add up to the headline valid count."""
        valid_fields = result["valid_by_field"]
        assert sum(valid_fields.values()) == result["currently_valid"]

    def test_approved_field_sum_matches_total(self, result):
        """Per-field approved counts add up to the headline approved count."""
        approved_fields = result["approved_by_field"]
        assert sum(approved_fields.values()) == result["total_approved"]
233
234
235
236
237
238
class TestParseReportPageDec2024:
    """Checks of ``parse_report_page`` on the December 2024 fixture and its percent-encoded URL."""

    URL = "https://goldcard.nat.gov.tw/en/news/2024%E5%B9%B412%E6%9C%88%E5%B0%B1%E6%A5%AD%E9%87%91%E5%8D%A1%E6%9C%80%E6%96%B0%E6%A0%B8%E7%99%BC%E6%95%B8%E5%AD%97/"

    @pytest.fixture
    def result(self):
        """Parsed output for the December 2024 fixture page."""
        parsed = parse_report_page(REPORT_PAGE_DEC_2024, self.URL)
        return parsed

    def test_report_date(self, result):
        # Expected value is the last day of December 2024.
        reported = result["report_date"]
        assert reported == "2024-12-31"

    def test_total_approved(self, result):
        approved = result["total_approved"]
        assert approved == 12082

    def test_currently_valid(self, result):
        valid = result["currently_valid"]
        assert valid == 7545

    def test_valid_economy(self, result):
        valid_fields = result["valid_by_field"]
        assert valid_fields["Economy"] == 2913

    def test_valid_science_tech(self, result):
        valid_fields = result["valid_by_field"]
        assert valid_fields["Science & Technology"] == 1426
261
262
263
264
265
266
class TestParseReportPageEdgeCases:
    """Edge-case inputs for ``parse_report_page`` (and one for ``parse_report_links``)."""

    def test_minimal_page_returns_partial_data(self):
        """Headline counts are read from the minimal page; the expected date matches the URL's month."""
        page_url = "https://goldcard.nat.gov.tw/en/news/2025-jan-cumulative-number-of-employment-gold-card-issuance/"
        parsed = parse_report_page(REPORT_PAGE_MINIMAL, page_url)
        assert parsed["total_approved"] == 200
        assert parsed["currently_valid"] == 100
        # The fixture text carries no date at all.
        assert parsed["report_date"] == "2025-01-31"

    def test_alt_text_format_extracts_counts(self):
        """Counts written number-first inside a sentence are still extracted."""
        page_url = "https://goldcard.nat.gov.tw/en/news/2025-oct-cumulative-number-of-employment-gold-card-issuance/"
        parsed = parse_report_page(REPORT_PAGE_ALT_FORMAT, page_url)
        assert parsed["total_approved"] == 14907
        assert parsed["currently_valid"] == 8129

    def test_result_always_has_required_keys(self):
        """Even a page with no data yields a result containing every expected key."""
        page_url = "https://goldcard.nat.gov.tw/en/news/2025-oct-cumulative-number-of-employment-gold-card-issuance/"
        parsed = parse_report_page("<html><body><p>No data here</p></body></html>", page_url)
        required = (
            "report_date",
            "report_url",
            "total_approved",
            "currently_valid",
            "valid_by_field",
            "approved_by_field",
        )
        for key in required:
            assert key in parsed, f"Missing key: {key}"

    def test_unrecognised_url_slug_returns_none_date(self):
        """A slug without a recognisable month gives ``report_date`` of None; the URL is echoed back."""
        page_url = "https://goldcard.nat.gov.tw/en/news/some-unknown-slug/"
        empty_article = "<html><body><article><p>No data</p></article></body></html>"
        parsed = parse_report_page(empty_article, page_url)
        assert parsed["report_date"] is None
        assert parsed["report_url"] == page_url

    def test_absolute_url_in_listing_not_duplicated(self):
        """An already-absolute href must not get the site origin prepended a second time."""
        listing = """<html><body>
 <a href="https://goldcard.nat.gov.tw/en/news/2025-oct-cumulative-number-of-employment-gold-card-issuance/">Oct</a>
 <a href="/en/news/2025-sep-cumulative-number-of-employment-gold-card-issuance/">Sep</a>
 </body></html>"""
        found = parse_report_links(listing)
        assert len(found) == len(set(found)), "Duplicates found"
        assert all(href.startswith("https://goldcard.nat.gov.tw") for href in found)
        # A doubled origin would contain this run-together host string.
        assert all("goldcard.nat.gov.twgoldcard" not in href for href in found)
305
306
307
308
309
310
class TestComputeTrends:
    """Checks for ``compute_trends`` on small hand-built report lists."""

    @pytest.fixture
    def two_reports(self):
        """Two report dicts, the more recent one (October 2025) first."""
        newer = {
            "report_date": "2025-10-31",
            "total_approved": 14907,
            "currently_valid": 8129,
            "valid_by_field": {"Economy": 3038, "Digital": 938, "Finance": 398},
            "approved_by_field": {},
        }
        older = {
            "report_date": "2024-12-31",
            "total_approved": 12082,
            "currently_valid": 7545,
            "valid_by_field": {"Economy": 2913, "Digital": 690, "Finance": 418},
            "approved_by_field": {},
        }
        return [newer, older]

    def test_valid_growth_pct(self, two_reports):
        """Valid-card growth is the percentage change from 7545 to 8129, rounded to 2 places."""
        summary = compute_trends(two_reports)
        latest, previous = 8129, 7545
        expected = round((latest - previous) / previous * 100, 2)
        assert summary["valid_cards_growth_pct"] == expected

    def test_approved_growth_pct(self, two_reports):
        """Approved growth is the percentage change from 12082 to 14907, rounded to 2 places."""
        summary = compute_trends(two_reports)
        latest, previous = 14907, 12082
        expected = round((latest - previous) / previous * 100, 2)
        assert summary["total_approved_growth_pct"] == expected

    def test_monthly_net_new(self, two_reports):
        """Net new valid cards equals the difference between the two valid counts."""
        summary = compute_trends(two_reports)
        net_new = 8129 - 7545
        assert summary["monthly_net_new_valid"] == net_new

    def test_fastest_growing_field(self, two_reports):
        """Digital is expected to be reported as the fastest-growing field."""
        summary = compute_trends(two_reports)
        # In the fixture Digital goes 690 -> 938, Economy 2913 -> 3038 and
        # Finance drops 418 -> 398, so Digital has the largest rise in both
        # absolute and relative terms.
        assert summary["fastest_growing_field"] == "Digital"

    def test_single_report_returns_empty(self):
        """A single report yields an empty dict."""
        lone = {"currently_valid": 100, "total_approved": 200, "valid_by_field": {}, "approved_by_field": {}}
        summary = compute_trends([lone])
        assert summary == {}

    def test_empty_list_returns_empty(self):
        """No reports yields an empty dict."""
        summary = compute_trends([])
        assert summary == {}

    def test_reports_analyzed_count(self, two_reports):
        """The result records how many reports were passed in."""
        summary = compute_trends(two_reports)
        assert summary["reports_analyzed"] == 2

    def test_zero_growth_when_counts_identical(self):
        """Passing the same report twice gives zero growth and zero net new cards."""
        rep = {
            "report_date": "2025-10-31",
            "total_approved": 14907,
            "currently_valid": 8129,
            "valid_by_field": {"Economy": 3038},
            "approved_by_field": {},
        }
        # The very same dict object is used for both entries.
        summary = compute_trends([rep, rep])
        assert summary["valid_cards_growth_pct"] == 0.0
        assert summary["monthly_net_new_valid"] == 0
370
371
372
373
374
375
class TestHelpers:
    """Direct checks of the private helpers ``_last_day_of_month`` and ``_parse_field_line``."""

    # --- _last_day_of_month -------------------------------------------------

    def test_last_day_jan(self):
        last_day = _last_day_of_month(2025, 1)
        assert last_day == "2025-01-31"

    def test_last_day_feb_non_leap(self):
        last_day = _last_day_of_month(2025, 2)
        assert last_day == "2025-02-28"

    def test_last_day_feb_leap(self):
        # 2024 is a leap year, so February has 29 days.
        last_day = _last_day_of_month(2024, 2)
        assert last_day == "2024-02-29"

    def test_last_day_dec(self):
        last_day = _last_day_of_month(2024, 12)
        assert last_day == "2024-12-31"

    # --- _parse_field_line --------------------------------------------------

    def test_parse_field_line_colon_format(self):
        """"name: number" with a thousands separator parses to (name, int)."""
        assert _parse_field_line("Economy: 3,038") == ("Economy", 3038)

    def test_parse_field_line_pipe_format(self):
        """"name | number" parses to (name, int)."""
        assert _parse_field_line("Digital | 938") == ("Digital", 938)

    def test_parse_field_line_science_tech_ampersand(self):
        """An ampersand inside the field name is kept as part of the name."""
        assert _parse_field_line("Science & Technology: 1,407") == ("Science & Technology", 1407)

    def test_parse_field_line_no_match(self):
        """Text without a number yields None."""
        parsed = _parse_field_line("Some random text without a number")
        assert parsed is None

    def test_parse_field_line_html_entity_ampersand(self):
        # NOTE(review): the test name refers to an HTML entity, but the input
        # below contains a plain "&" -- confirm whether "&amp;" was intended.
        assert _parse_field_line("Science & Technology: 2,769") == ("Science & Technology", 2769)