"""One-shot: build variant groups for the 1003 new products and detect reuse from osszesito.
Outputs:
  _variant_groups.json  - list of groups to process (reuse vs generate)
  _new_progress.json    - progress checkpoint (created if missing)
"""
import openpyxl, json, re, sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

OSS = 'osszesito_leirasok.xlsx'

# --- load osszesito (existing, with descriptions) ---
# Each data row (row 1 is skipped as the header) is read positionally:
# column 0 -> article number, 1 -> 'name', 2 -> 'cat', 3 -> 'short', 4 -> 'long'.
wb = openpyxl.load_workbook(OSS, read_only=True)
oss = {}  # ck -> {name, cat, short, long}
try:
    ws = wb['Leírások']
    for row in ws.iter_rows(min_row=2, values_only=True):
        if row[0] is None:
            continue  # rows without an article number carry nothing usable
        ck = str(row[0]).strip()
        oss[ck] = {'name': row[1] or '', 'cat': row[2] or '',
                   'short': row[3] or '', 'long': row[4] or ''}
finally:
    # Close the workbook even if reading the sheet fails, so the file
    # handle held by the read-only workbook is always released.
    wb.close()
oss_cks = set(oss)

# Load the new products; the context manager closes the file deterministically.
with open('_new_products.json', encoding='utf-8') as fh:
    new = json.load(fh)

# --- helpers ---
# A '-' or '/' followed by exactly two digits at the very end of the article number.
SIZE_SUFFIX = re.compile(r'[-/](\d{2})$')


def model_prefix(ck):
    """Split an article number into its model prefix and a "had a size" flag.

    A trailing ``-NN`` / ``/NN`` is treated as a size only when NN lies in
    38..75 inclusive. Suffixes of three or more digits (e.g. ``11020-132``)
    do not match the pattern and are returned untouched.

    Returns:
        ``(prefix, True)`` when a size suffix was removed, otherwise
        ``(ck, False)``.
    """
    hit = SIZE_SUFFIX.search(ck)
    if hit is None:
        return ck, False
    size = int(hit.group(1))
    if size < 38 or size > 75:
        # Two trailing digits, but outside the accepted size window.
        return ck, False
    return ck[:hit.start()], True

def norm_name(name, brand, ck):
    """Return the descriptive remainder of a product name, lower-cased.

    Steps, in order:
      1. drop every case-insensitive occurrence of ``brand`` (when truthy);
      2. drop the literal article number ``ck`` and then its model prefix
         (the article number with any size suffix stripped);
      3. drop ``/NN`` size markers and any stand-alone two-digit number;
      4. turn every character outside digits, a-z, the accented letters
         listed in the pattern (matched case-insensitively) and space into
         a space;
      5. collapse whitespace runs, trim, lower-case.
    """
    text = str(name or '')
    if brand:
        text = re.compile(re.escape(str(brand)), re.I).sub('', text)

    prefix = model_prefix(ck)[0]
    # Full article number first, then the shorter prefix.
    text = text.replace(ck, '').replace(prefix, '')

    for size_pattern in (r'/\d{2}\b', r'\b\d{2}\b'):
        text = re.sub(size_pattern, '', text)

    text = re.sub(r'[^0-9a-záéíóöőúüű ]', ' ', text, flags=re.I)
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().lower()

def is_descriptive(nn):
    """Tell whether a normalized name holds at least three lower-case letters.

    Only the characters a-z and the accented letters listed in the pattern
    count; digits, spaces, upper-case letters and anything else are ignored.
    """
    letters_only = re.compile(r'[^a-záéíóöőúüű]').sub('', nn)
    return not len(letters_only) < 3

# --- build osszesito indexes for reuse lookup ---
# 1) by model-prefix for size-stripped sized items (strongest, name-independent)
# 2) by (prefix, norm_name) for descriptive-name items
# Only entries with a non-blank 'short' description are indexed.
oss_by_prefix = {}     # prefix -> best ck (prefers descriptive name)
oss_idx = {}           # (prefix, nn) -> ck
_oss_prefix_descriptive = {}  # prefix -> is the ck stored in oss_by_prefix descriptive?
for ck, d in oss.items():
    if not (d['short'] and str(d['short']).strip()):
        continue
    pref, sized = model_prefix(ck)
    # The first word of the name is used as the brand guess. Coerce to str and
    # guard the split so numeric cells or whitespace-only names cannot raise.
    name = str(d['name'] or '')
    words = name.split()
    brand_guess = words[0] if words else ''
    nn = norm_name(name, brand_guess, ck)
    descriptive = is_descriptive(nn)
    if sized:
        # Prefer a sibling whose name is descriptive. The stored flag comes from
        # the same brand-stripped normalization as the candidate; re-normalizing
        # the current holder without a brand would let a brand-only name count
        # as descriptive and block every later, better sibling.
        if pref not in oss_by_prefix or (
                descriptive and not _oss_prefix_descriptive[pref]):
            oss_by_prefix[pref] = ck
            _oss_prefix_descriptive[pref] = descriptive
    if descriptive:
        # First article number seen for a given (prefix, name) pair wins.
        oss_idx.setdefault((pref, nn), ck)

# --- group new products ---
# Group key kinds:
#   'D' - descriptive name: keyed by (model prefix, normalized name)
#   'P' - no descriptive name but a size suffix: keyed by model prefix only
#   'S' - neither: singleton keyed by the full article number
groups = {}  # gkey -> group dict
for item in new:
    code = item['ck']
    maker = item['p'].get('Márka', '')
    prefix, has_size = model_prefix(code)
    descr = norm_name(item['name'], maker, code)

    if is_descriptive(descr):
        gkey = ('D', prefix, descr)
    else:
        gkey = ('P', prefix, '') if has_size else ('S', code, '')

    if gkey not in groups:
        # The first product seen for a key supplies the group's name,
        # brand, category and params.
        groups[gkey] = {
            'key': list(gkey),
            'name': item['name'],
            'brand': maker,
            'cat': item['cat'],
            'cks': [],
            'params': item['p'],
        }
    groups[gkey]['cks'].append(code)

# --- detect reuse from osszesito ---
# Lookup order: model prefix first (kinds 'D' and 'P'), then, for 'D' only,
# the (prefix, normalized name) index. Singletons ('S') never reuse.
out = []
reuse_n = 0  # number of article numbers covered by a reused description
for (kind, pref, nn), grp in groups.items():
    source = oss_by_prefix.get(pref) if kind in ('D', 'P') else None
    if kind == 'D' and not source:
        source = oss_idx.get((pref, nn))
    grp['reuse_from'] = source
    if source:
        reuse_n += len(grp['cks'])
    out.append(grp)

# stable order: generate-needed first, grouped by category
# The reuse flag uses truthiness so it agrees with the counts below, and the
# name is coerced to str so mixed str / non-str names cannot break the sort.
out.sort(key=lambda g: (bool(g['reuse_from']), str(g['cat']), str(g['name'] or '')))

# Write through a context manager so the file is flushed and closed
# before the summary is printed.
with open('_variant_groups.json', 'w', encoding='utf-8') as fh:
    json.dump(out, fh, ensure_ascii=False, indent=1)

total_cks = sum(len(g['cks']) for g in out)
gen_groups = [g for g in out if not g['reuse_from']]
gen_cks = sum(len(g['cks']) for g in gen_groups)
print('groups total:', len(out))
print('  reuse-from-osszesito groups:', len(out) - len(gen_groups), '-> cikks:', reuse_n)
print('  need-generate groups:', len(gen_groups), '-> cikks:', gen_cks)
print('  total cikks covered:', total_cks, '(expected 1003)')
# multi-size generate groups (1 gen covers many rows)
multi = [g for g in gen_groups if len(g['cks']) > 1]
print('  multi-size generate groups:', len(multi),
      '(saving', sum(len(g['cks']) for g in multi) - len(multi), 'gens)')

import os
# Create an empty progress checkpoint only when none exists, so an
# existing checkpoint is never overwritten.
if not os.path.exists('_new_progress.json'):
    # The context manager closes the handle instead of leaving it to the GC.
    with open('_new_progress.json', 'w', encoding='utf-8') as fh:
        json.dump({'done_cks': [], 'done_groups': []}, fh,
                  ensure_ascii=False, indent=1)
    print('created _new_progress.json')
