1"""
2Unit tests for parser.py — pure functions, no network calls.
3"""
4
5import pytest
6from parser import (
7 gregorian_to_roc,
8 roc_to_gregorian,
9 _clean_number,
10 _translate_header,
11 parse_table,
12 ALL_COLUMNS,
13 BALANCE_COLUMNS,
14 CASHFLOW_COLUMNS,
15 OUTPUT_FIELDS,
16 VALID_STATEMENT_TYPES,
17 API_NAME_MAP,
18 TYPEK_MAP,
19)
20
21
22
23
24
25
# Income-statement fixture: one ``hasBorder`` table with six columns and two
# company rows (2330, 2317). The 2317 row carries a parenthesised value, and
# the final row has "說明" in its first cell with "--" in every numeric cell;
# TestParseTable expects that final row to be absent from the parsed output.
SAMPLE_INCOME_HTML = """
<html><body>
<table class="hasBorder">
<tr>
 <th>公司代號</th>
 <th>公司名稱</th>
 <th>營業收入</th>
 <th>營業毛利(毛損)</th>
 <th>本期淨利(淨損)</th>
 <th>基本每股盈餘(元)</th>
</tr>
<tr>
 <td>2330</td><td>台積電</td><td>2,169,846,000</td><td>1,235,477,000</td><td>878,662,000</td><td>33.86</td>
</tr>
<tr>
 <td>2317</td><td>鴻海</td><td>1,500,000,000</td><td>(50,000,000)</td><td>80,000,000</td><td>5.50</td>
</tr>
<tr>
 <td>說明</td><td>合計</td><td>--</td><td>--</td><td>--</td><td>--</td>
</tr>
</table>
</body></html>
"""
49
# Two ``hasBorder`` tables on one page with different third columns: the first
# table (stock 2801) has 利息淨收益 and no 營業收入 column, while the second
# table (stocks 2330 and 2317) has 營業收入 instead.
SAMPLE_MULTI_TABLE_HTML = """
<html><body>
<table class="hasBorder">
<tr><th>公司代號</th><th>公司名稱</th><th>利息淨收益</th><th>本期淨利(淨損)</th><th>基本每股盈餘(元)</th></tr>
<tr><td>2801</td><td>彰銀</td><td>5,000,000</td><td>11,000,000</td><td>1.00</td></tr>
</table>
<table class="hasBorder">
<tr><th>公司代號</th><th>公司名稱</th><th>營業收入</th><th>本期淨利(淨損)</th><th>基本每股盈餘(元)</th></tr>
<tr><td>2330</td><td>台積電</td><td>2,000,000,000</td><td>800,000,000</td><td>30.80</td></tr>
<tr><td>2317</td><td>鴻海</td><td>1,500,000,000</td><td>80,000,000</td><td>5.50</td></tr>
</table>
</body></html>
"""
63
# Page without any <table> element; used to check that parsing yields [].
SAMPLE_NO_TABLE_HTML = "<html><body><p>No data available</p></body></html>"
65
66
67
68
69
70
class TestYearConversion:
    """Checks for gregorian_to_roc() and roc_to_gregorian()."""

    def test_2024_to_113(self):
        """Gregorian 2024 is expected to convert to ROC year 113."""
        roc_year = gregorian_to_roc(2024)
        assert roc_year == 113

    def test_2025_to_114(self):
        """Gregorian 2025 is expected to convert to ROC year 114."""
        roc_year = gregorian_to_roc(2025)
        assert roc_year == 114

    def test_1912_to_1(self):
        """Gregorian 1912 is expected to convert to ROC year 1."""
        roc_year = gregorian_to_roc(1912)
        assert roc_year == 1

    def test_pre_roc_raises(self):
        """Gregorian 1911 is expected to be rejected with ValueError."""
        with pytest.raises(ValueError):
            gregorian_to_roc(1911)

    def test_roc_to_gregorian_113(self):
        """ROC year 113 is expected to convert back to Gregorian 2024."""
        gregorian_year = roc_to_gregorian(113)
        assert gregorian_year == 2024

    def test_roundtrip(self):
        """Converting 2024 to ROC and back again yields 2024."""
        roc_year = gregorian_to_roc(2024)
        assert roc_to_gregorian(roc_year) == 2024
90
91
92
93
94
95
class TestCleanNumber:
    """Checks for _clean_number(): numeric text becomes a float, placeholders None."""

    def test_plain_integer(self):
        parsed = _clean_number("1234567")
        assert parsed == 1234567.0

    def test_comma_separated(self):
        # Thousands separators are expected to be dropped.
        parsed = _clean_number("1,234,567")
        assert parsed == 1234567.0

    def test_negative_parentheses(self):
        # A parenthesised amount is expected to come back negative.
        parsed = _clean_number("(56,789)")
        assert parsed == -56789.0

    def test_double_dash_is_none(self):
        parsed = _clean_number("--")
        assert parsed is None

    def test_empty_string_is_none(self):
        parsed = _clean_number("")
        assert parsed is None

    def test_whitespace_only_is_none(self):
        parsed = _clean_number(" ")
        assert parsed is None

    def test_decimal_value(self):
        parsed = _clean_number("33.86")
        assert parsed == pytest.approx(33.86)

    def test_negative_decimal(self):
        parsed = _clean_number("(3.75)")
        assert parsed == pytest.approx(-3.75)

    def test_na_is_none(self):
        parsed = _clean_number("N/A")
        assert parsed is None

    def test_full_width_dash_is_none(self):
        # NOTE(review): the literal below is an ASCII hyphen in this copy of
        # the file although the test name says "full width" - confirm the
        # intended character.
        parsed = _clean_number("-")
        assert parsed is None

    def test_zero_string(self):
        # Zero must come back as 0.0, not as None.
        parsed = _clean_number("0")
        assert parsed == 0.0
129
130
131
132
133
134
class TestTranslateHeader:
    """Checks for _translate_header(): Chinese column header to output key."""

    def test_known_chinese_header_stock_id(self):
        key = _translate_header("公司代號")
        assert key == "stock_id"

    def test_known_chinese_header_revenue(self):
        key = _translate_header("營業收入")
        assert key == "revenue"

    def test_known_chinese_header_net_income(self):
        key = _translate_header("本期淨利(淨損)")
        assert key == "net_income"

    def test_bank_column_translated(self):
        key = _translate_header("利息淨收益")
        assert key == "interest_net_income"

    def test_unknown_header_gets_fallback(self):
        # Only the "col_" prefix of the fallback name is pinned here.
        assert _translate_header("未知欄位").startswith("col_")

    def test_strips_whitespace(self):
        # Surrounding whitespace must not prevent the lookup.
        key = _translate_header(" 公司名稱 ")
        assert key == "company_name"

    def test_eps_basic(self):
        key = _translate_header("基本每股盈餘(元)")
        assert key == "eps_basic"
157
158
159
160
161
162
class TestParseTable:
    """Checks for parse_table() against the single-table income fixture."""

    @staticmethod
    def _income_rows():
        """Parse SAMPLE_INCOME_HTML as "income", year 2024, season 4, exchange "listed"."""
        return parse_table(SAMPLE_INCOME_HTML, "income", 2024, 4, "listed")

    def test_parses_two_data_rows(self):
        assert len(self._income_rows()) == 2

    def test_stock_ids_correct(self):
        parsed = self._income_rows()
        first_id = parsed[0]["stock_id"]
        assert first_id == "2330"
        second_id = parsed[1]["stock_id"]
        assert second_id == "2317"

    def test_revenue_parsed_as_float(self):
        first = self._income_rows()[0]
        assert first["revenue"] == pytest.approx(2169846000.0)

    def test_negative_value_parsed_correctly(self):
        # The fixture gives 2317 a parenthesised gross profit.
        second = self._income_rows()[1]
        assert second["gross_profit"] == pytest.approx(-50000000.0)

    def test_eps_parsed_as_float(self):
        first = self._income_rows()[0]
        assert first["eps_basic"] == pytest.approx(33.86)

    def test_period_field_set(self):
        first = self._income_rows()[0]
        assert first["period"] == "2024Q4"

    def test_year_and_season_fields(self):
        first = self._income_rows()[0]
        assert first["year"] == 2024
        assert first["season"] == 4

    def test_exchange_field(self):
        first = self._income_rows()[0]
        assert first["exchange"] == "listed"

    def test_summary_row_skipped(self):
        # The fixture's last row has "說明" in the stock-id column.
        parsed_ids = [entry["stock_id"] for entry in self._income_rows()]
        assert "說明" not in parsed_ids

    def test_no_table_returns_empty(self):
        parsed = parse_table(SAMPLE_NO_TABLE_HTML, "income", 2024, 4, "listed")
        assert parsed == []

    def test_unknown_statement_type_raises(self):
        with pytest.raises(ValueError, match="Unknown statement_type"):
            parse_table(SAMPLE_INCOME_HTML, "banana", 2024, 4, "listed")

    def test_all_valid_statement_types_accepted(self):
        # Passes as long as no call raises; the return values are not inspected.
        for statement_type in VALID_STATEMENT_TYPES:
            parse_table(SAMPLE_INCOME_HTML, statement_type, 2024, 4, "listed")
214
215
216
217
218
219
class TestParseMultiTable:
    """Checks for parse_table() on a page that holds two ``hasBorder`` tables.

    SAMPLE_MULTI_TABLE_HTML has one table with an interest-income column
    (stock 2801) and a second table with a revenue column (stocks 2330, 2317).
    """

    @staticmethod
    def _rows(html):
        """Parse *html* as "income", year 2024, season 3, exchange "listed"."""
        return parse_table(html, "income", 2024, 3, "listed")

    @classmethod
    def _row(cls, stock_id):
        """Return the first parsed row of the multi-table fixture for *stock_id*."""
        rows = cls._rows(SAMPLE_MULTI_TABLE_HTML)
        return next(r for r in rows if r["stock_id"] == stock_id)

    def test_parses_all_tables(self):
        # One row from the first table plus two from the second.
        rows = self._rows(SAMPLE_MULTI_TABLE_HTML)
        assert len(rows) == 3

    def test_bank_row_has_net_income(self):
        bank = self._row("2801")
        assert bank["net_income"] == pytest.approx(11000000.0)

    def test_bank_row_revenue_is_none(self):
        # The table containing 2801 has no 營業收入 column, so the row is
        # expected to carry revenue=None rather than lack the key.
        bank = self._row("2801")
        assert bank["revenue"] is None

    def test_manufacturer_has_revenue(self):
        tsmc = self._row("2330")
        assert tsmc["revenue"] == pytest.approx(2000000000.0)

    def test_deduplication_keeps_first(self):
        # Rewriting 2317 to 2801 makes the second table repeat a stock id
        # that already appears in the first table.
        dup_html = SAMPLE_MULTI_TABLE_HTML.replace("2317", "2801")
        rows = self._rows(dup_html)
        ids = [r["stock_id"] for r in rows]
        # NOTE(review): only the count is asserted; which of the two
        # duplicate rows survives is not checked here.
        assert ids.count("2801") == 1

    def test_all_rows_have_output_fields(self):
        # OUTPUT_FIELDS comes from the module-level import; the former
        # function-local re-import was redundant.
        rows = self._rows(SAMPLE_MULTI_TABLE_HTML)
        for row in rows:
            for field in OUTPUT_FIELDS:
                assert field in row, f"Missing field '{field}' in row {row['stock_id']}"
254
255
256
257
258
259
class TestConstants:
    """Consistency checks on the lookup tables exported by the parser module."""

    @staticmethod
    def _assert_mirrored_in_all_columns(columns):
        """Assert every (header, key) pair of *columns* also appears in ALL_COLUMNS."""
        for zh, en in columns.items():
            assert zh in ALL_COLUMNS, f"{zh!r} missing from ALL_COLUMNS"
            assert ALL_COLUMNS[zh] == en

    @staticmethod
    def _assert_in_output_fields(*names):
        """Assert each of *names* is a member of OUTPUT_FIELDS."""
        for field in names:
            assert field in OUTPUT_FIELDS, f"{field!r} missing from OUTPUT_FIELDS"

    def test_all_statement_types_have_api_name(self):
        for statement_type in VALID_STATEMENT_TYPES:
            assert statement_type in API_NAME_MAP

    def test_all_api_names_start_with_ajax(self):
        for api_name in API_NAME_MAP.values():
            assert api_name.startswith("ajax_")

    def test_typek_map_has_listed_and_otc(self):
        for exchange in ("listed", "otc"):
            assert exchange in TYPEK_MAP

    def test_typek_values(self):
        listed_code = TYPEK_MAP["listed"]
        assert listed_code == "sii"
        otc_code = TYPEK_MAP["otc"]
        assert otc_code == "otc"

    def test_all_columns_has_stock_id(self):
        header = "公司代號"
        assert header in ALL_COLUMNS
        assert ALL_COLUMNS[header] == "stock_id"

    def test_valid_statement_types_count(self):
        # The length check additionally rules out duplicates that the set
        # comparison alone would hide.
        expected = {"income", "balance", "cashflow"}
        assert len(VALID_STATEMENT_TYPES) == 3
        assert set(VALID_STATEMENT_TYPES) == expected

    def test_balance_columns_in_all_columns(self):
        self._assert_mirrored_in_all_columns(BALANCE_COLUMNS)

    def test_cashflow_columns_in_all_columns(self):
        self._assert_mirrored_in_all_columns(CASHFLOW_COLUMNS)

    def test_output_fields_contains_balance_fields(self):
        self._assert_in_output_fields(
            "current_assets",
            "non_current_assets",
            "total_assets",
            "current_liabilities",
            "non_current_liabilities",
            "total_liabilities",
            "total_equity",
        )

    def test_output_fields_contains_cashflow_fields(self):
        self._assert_in_output_fields(
            "operating_cash_flow",
            "investing_cash_flow",
            "financing_cash_flow",
            "net_cash_change",
            "ending_cash",
        )

    def test_total_assets_variants_map_to_same_key(self):
        # Both header spellings must resolve to the same output key.
        for header in ("資產總計", "資產總額"):
            assert ALL_COLUMNS[header] == "total_assets"

    def test_total_liabilities_variants_map_to_same_key(self):
        for header in ("負債總計", "負債總額"):
            assert ALL_COLUMNS[header] == "total_liabilities"

    def test_total_equity_variants_map_to_same_key(self):
        for header in ("權益總計", "權益總額"):
            assert ALL_COLUMNS[header] == "total_equity"
317
318
319
320
321
322
# Balance-sheet fixture: one ``hasBorder`` table with nine columns using the
# "總計" total headers, and two company rows (2330, 2317).
SAMPLE_BALANCE_HTML = """
<html><body>
<table class="hasBorder">
<tr>
 <th>公司代號</th>
 <th>公司名稱</th>
 <th>流動資產</th>
 <th>非流動資產</th>
 <th>資產總計</th>
 <th>流動負債</th>
 <th>非流動負債</th>
 <th>負債總計</th>
 <th>權益總計</th>
</tr>
<tr>
 <td>2330</td><td>台積電</td>
 <td>1,500,000,000</td><td>3,000,000,000</td><td>4,500,000,000</td>
 <td>500,000,000</td><td>200,000,000</td><td>700,000,000</td><td>3,800,000,000</td>
</tr>
<tr>
 <td>2317</td><td>鴻海</td>
 <td>800,000,000</td><td>1,200,000,000</td><td>2,000,000,000</td>
 <td>300,000,000</td><td>100,000,000</td><td>400,000,000</td><td>1,600,000,000</td>
</tr>
</table>
</body></html>
"""
350
351
# Balance-sheet fixture using the "總額" header variants: a single row for
# stock 2801 with only the three total columns (assets, liabilities, equity).
SAMPLE_BALANCE_BANK_HTML = """
<html><body>
<table class="hasBorder">
<tr>
 <th>公司代號</th>
 <th>公司名稱</th>
 <th>資產總額</th>
 <th>負債總額</th>
 <th>權益總額</th>
</tr>
<tr>
 <td>2801</td><td>彰銀</td>
 <td>3,000,000,000</td><td>2,700,000,000</td><td>300,000,000</td>
</tr>
</table>
</body></html>
"""
369
# Cash-flow fixture: one ``hasBorder`` table with a single row for stock 2330;
# the investing and financing amounts are written in parentheses.
SAMPLE_CASHFLOW_HTML = """
<html><body>
<table class="hasBorder">
<tr>
 <th>公司代號</th>
 <th>公司名稱</th>
 <th>營業活動之淨現金流入(流出)</th>
 <th>投資活動之淨現金流入(流出)</th>
 <th>籌資活動之淨現金流入(流出)</th>
 <th>本期現金及約當現金增加(減少)數</th>
 <th>期末現金及約當現金餘額</th>
</tr>
<tr>
 <td>2330</td><td>台積電</td>
 <td>1,000,000,000</td><td>(500,000,000)</td><td>(200,000,000)</td>
 <td>300,000,000</td><td>400,000,000</td>
</tr>
</table>
</body></html>
"""
390
391
392
393
394
395
class TestParseBalanceSheet:
    """Checks for parse_table() with statement type "balance"."""

    @staticmethod
    def _parse(html):
        """Parse *html* as "balance", year 2024, season 3, exchange "listed"."""
        return parse_table(html, "balance", 2024, 3, "listed")

    @classmethod
    def _company(cls, html, stock_id):
        """Return the first row parsed from *html* whose stock_id matches."""
        return next(r for r in cls._parse(html) if r["stock_id"] == stock_id)

    def test_parses_two_rows(self):
        parsed = self._parse(SAMPLE_BALANCE_HTML)
        assert len(parsed) == 2

    def test_total_assets_general_mfg(self):
        tsmc = self._company(SAMPLE_BALANCE_HTML, "2330")
        assert tsmc["total_assets"] == pytest.approx(4_500_000_000.0)

    def test_current_assets(self):
        tsmc = self._company(SAMPLE_BALANCE_HTML, "2330")
        assert tsmc["current_assets"] == pytest.approx(1_500_000_000.0)

    def test_non_current_assets(self):
        tsmc = self._company(SAMPLE_BALANCE_HTML, "2330")
        assert tsmc["non_current_assets"] == pytest.approx(3_000_000_000.0)

    def test_total_liabilities(self):
        tsmc = self._company(SAMPLE_BALANCE_HTML, "2330")
        assert tsmc["total_liabilities"] == pytest.approx(700_000_000.0)

    def test_total_equity(self):
        tsmc = self._company(SAMPLE_BALANCE_HTML, "2330")
        assert tsmc["total_equity"] == pytest.approx(3_800_000_000.0)

    def test_income_fields_are_none_for_balance(self):
        # Income-statement keys are expected to be present but empty.
        tsmc = self._company(SAMPLE_BALANCE_HTML, "2330")
        for key in ("revenue", "net_income", "eps_basic"):
            assert tsmc[key] is None

    def test_bank_total_assets_variant(self):
        bank = self._company(SAMPLE_BALANCE_BANK_HTML, "2801")
        assert bank["total_assets"] == pytest.approx(3_000_000_000.0)

    def test_bank_total_liabilities_variant(self):
        bank = self._company(SAMPLE_BALANCE_BANK_HTML, "2801")
        assert bank["total_liabilities"] == pytest.approx(2_700_000_000.0)

    def test_bank_total_equity_variant(self):
        bank = self._company(SAMPLE_BALANCE_BANK_HTML, "2801")
        assert bank["total_equity"] == pytest.approx(300_000_000.0)

    def test_all_output_fields_present(self):
        for parsed_row in self._parse(SAMPLE_BALANCE_HTML):
            for field in OUTPUT_FIELDS:
                assert field in parsed_row, f"Field {field!r} missing from balance row"
453
454
455
456
457
458
class TestParseCashFlow:
    """Checks for parse_table() with statement type "cashflow"."""

    @staticmethod
    def _parse():
        """Parse SAMPLE_CASHFLOW_HTML as "cashflow", year 2024, season 3, exchange "listed"."""
        return parse_table(SAMPLE_CASHFLOW_HTML, "cashflow", 2024, 3, "listed")

    @classmethod
    def _first(cls):
        """Return the first row of the parsed cash-flow fixture."""
        return cls._parse()[0]

    def test_parses_one_row(self):
        parsed = self._parse()
        assert len(parsed) == 1

    def test_operating_cash_flow(self):
        entry = self._first()
        assert entry["operating_cash_flow"] == pytest.approx(1_000_000_000.0)

    def test_investing_cash_flow_negative(self):
        # Written as "(500,000,000)" in the fixture.
        entry = self._first()
        assert entry["investing_cash_flow"] == pytest.approx(-500_000_000.0)

    def test_financing_cash_flow_negative(self):
        # Written as "(200,000,000)" in the fixture.
        entry = self._first()
        assert entry["financing_cash_flow"] == pytest.approx(-200_000_000.0)

    def test_net_cash_change(self):
        entry = self._first()
        assert entry["net_cash_change"] == pytest.approx(300_000_000.0)

    def test_ending_cash(self):
        entry = self._first()
        assert entry["ending_cash"] == pytest.approx(400_000_000.0)

    def test_income_fields_are_none_for_cashflow(self):
        entry = self._first()
        for key in ("revenue", "gross_profit"):
            assert entry[key] is None

    def test_balance_fields_are_none_for_cashflow(self):
        entry = self._first()
        for key in ("total_assets", "total_equity"):
            assert entry[key] is None

    def test_all_output_fields_present(self):
        for parsed_row in self._parse():
            for field in OUTPUT_FIELDS:
                assert field in parsed_row, f"Field {field!r} missing from cashflow row"