From 5efb2c25ea5b9773b4019267baaf0e819ba46094 Mon Sep 17 00:00:00 2001
From: Malin <malin@tfb.local>
Date: Sun, 22 Mar 2026 21:01:03 +0100
Subject: [PATCH] feat: add rows_processed/rows_skipped diagnostics to health +
 refresh endpoints

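Helps diagnose whether the product cap is caused by EAN filtering or by a
downstream limit. The health and refresh endpoints now return product_count,
rows_processed, and rows_skipped.

rows_processed is the total number of sheet rows read (from row 6 onward);
rows_skipped counts the rows dropped because the item code is empty or the
EAN is blank or non-numeric.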

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 main.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)
diff --git a/main.py b/main.py
index 3bbb1eb..f49738b 100644
--- a/main.py
+++ b/main.py
@@ -39,8 +39,12 @@ def safe_float(value, default):
         return float(str(value).strip())
     except (ValueError, TypeError):
         return default
+
+
 products_cache = []
 last_refresh = None
+rows_processed = 0
+rows_skipped = 0
 cache_lock = threading.Lock()
 api_key_header = APIKeyHeader(name="X-API-Key")
 
@@ -52,18 +56,19 @@ def verify_key(key: str = Security(api_key_header)):
 
 
 def download_and_parse():
-    global products_cache, last_refresh
+    global products_cache, last_refresh, rows_processed, rows_skipped
     resp = requests.get(EXCEL_URL, timeout=60)
     resp.raise_for_status()
-    # read_only=True would stop at the sheet's declared dimension attribute, silently
-    # missing any rows MTZ added beyond the original range. Since the file is already
-    # in memory (BytesIO), read_only gives no I/O benefit and data_only=True suffices.
+    # read_only=True would stop at the sheet's declared <dimension ref> attribute,
+    # silently missing any rows added beyond the original range.
     wb = load_workbook(BytesIO(resp.content), data_only=True)
     ws = wb.active
     rows = list(ws.iter_rows(min_row=6, values_only=True))
     parsed = []
+    skipped = 0
     for row in rows:
-        if row[1] is None:  # col B (index 1) = item_code
+        if row[1] is None:  # col B (index 1) = item_code — empty row
+            skipped += 1
             continue
         ean_raw = row[3]  # col D
         if ean_raw is None:
@@ -75,6 +80,7 @@ def download_and_parse():
 
         # Skip products with blank or non-numeric EAN codes
         if not ean or not ean.isdigit():
+            skipped += 1
             continue
 
         brand_raw = row[9]  # col J
@@ -93,6 +99,8 @@ def download_and_parse():
         )
     with cache_lock:
         products_cache = parsed
+        rows_processed = len(rows)
+        rows_skipped = skipped
         last_refresh = time.time()
     wb.close()
 
@@ -115,11 +123,14 @@ def startup():
 
 @app.get("/api/health")
 def health():
-    return {
-        "status": "ok",
-        "product_count": len(products_cache),
-        "last_refresh": last_refresh,
-    }
+    with cache_lock:
+        return {
+            "status": "ok",
+            "product_count": len(products_cache),
+            "rows_processed": rows_processed,
+            "rows_skipped": rows_skipped,
+            "last_refresh": last_refresh,
+        }
 
 
 @app.get("/api/products", dependencies=[Depends(verify_key)])
@@ -131,4 +142,10 @@ def get_products():
 @app.post("/api/refresh", dependencies=[Depends(verify_key)])
 def refresh():
     download_and_parse()
-    return {"status": "ok", "product_count": len(products_cache)}
+    with cache_lock:
+        return {
+            "status": "ok",
+            "product_count": len(products_cache),
+            "rows_processed": rows_processed,
+            "rows_skipped": rows_skipped,
+        }