"""Setup: master -> updated copy, build targets list, init progress checkpoint."""
import openpyxl, shutil, json, re, os, sys, io
# Force UTF-8 on stdout: the status messages printed below contain non-ASCII
# characters (e.g. "cikkszáms").
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Every input and output file lives in the same directory as this script.
BASE = os.path.dirname(os.path.abspath(__file__))
SRC = os.path.join(BASE, 'merged_master_v2 (1).xlsx')      # master workbook (only read here)
DST = os.path.join(BASE, 'merged_master_v2_updated.xlsx')  # working copy of the master
LIST = os.path.join(BASE, 'ujrairo_lista.xlsx')            # workbook listing cikkszáms to rewrite
TARGETS = os.path.join(BASE, '_targets.json')              # generated leader/variant list
PROGRESS = os.path.join(BASE, '_progress.json')            # progress checkpoint

# 1. Copy master to updated (only if not already exists, to preserve in-progress work)
if os.path.exists(DST):
    print(f'{os.path.basename(DST)} already exists, keeping it.')
else:
    # Copy to a temporary name first and rename afterwards. A copy interrupted
    # half-way would otherwise leave a truncated DST that the next run would
    # mistake for valid in-progress work and keep.
    tmp_dst = DST + '.tmp'
    try:
        shutil.copy(SRC, tmp_dst)
        os.replace(tmp_dst, DST)
    finally:
        # Only still present when the copy or the rename failed.
        if os.path.exists(tmp_dst):
            os.remove(tmp_dst)
    print(f'Copied master -> {os.path.basename(DST)}')

# 2. Read rewrite list cikkszáms
# A workbook opened with read_only=True keeps its file handle open until
# close() is called, so release it explicitly once column A has been collected.
wb_list = openpyxl.load_workbook(LIST, read_only=True)
try:
    ws_list = wb_list['Sheet1']
    # Row 1 is skipped (min_row=2). The first cell of each row is the cikkszám;
    # empty row tuples and blank first cells are ignored.
    rewrite_set = {
        str(row[0])
        for row in ws_list.iter_rows(min_row=2, values_only=True)
        if row and row[0]
    }
finally:
    wb_list.close()
print(f'Rewrite list: {len(rewrite_set)} cikkszáms')

# 3. Scan master, build {cikkszám: {row, name, short_len, long_len}} for in-scope rows
NAME_IDX = 1          # 0-based tuple index -> worksheet column B
SHORT_IDX = 103       # 0-based tuple index -> worksheet column CZ
LONG_IDX = 104        # 0-based tuple index -> worksheet column DA
MIN_TEXT_LEN = 500    # a row is in scope when its shorter text is below this length


def _cell(row, idx):
    """Return ``row[idx]``, or None when the row tuple is too short to hold it."""
    return row[idx] if idx < len(row) else None


def _text_len(value):
    """Return the length of a cell value as text; empty/falsy cells count as 0."""
    if not value:
        return 0
    # Non-string cells (e.g. numbers) are measured by their string form
    # rather than crashing in len().
    return len(value) if isinstance(value, str) else len(str(value))


in_scope = {}  # cikkszám -> {row (1-based), name, short_len, long_len}
all_rows = {}  # same record for every row; used later for variant grouping

# Read-only workbooks hold the file open until close() is called; release DST
# explicitly so the handle does not leak.
wb = openpyxl.load_workbook(DST, read_only=True)
try:
    ws = wb['Sheet1']
    for r_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2):
        key = _cell(row, 0)
        if key is None:
            continue
        cikk = str(key)
        sl = _text_len(_cell(row, SHORT_IDX))
        ll = _text_len(_cell(row, LONG_IDX))
        info = {
            'row': r_idx,
            'name': _cell(row, NAME_IDX) or '',
            'short_len': sl,
            'long_len': ll,
        }
        all_rows[cikk] = info
        # In scope: explicitly listed for rewrite, or either text is too short.
        if cikk in rewrite_set or min(sl, ll) < MIN_TEXT_LEN:
            in_scope[cikk] = dict(info)  # separate dict, as all_rows keeps its own
finally:
    wb.close()

print(f'In-scope total: {len(in_scope)}')

# 4. Variant grouping - base = strip trailing -NN (2 digits)
def get_base(cikk):
    """Return *cikk* without a trailing ``-NN`` (hyphen plus exactly two digits).

    The value is returned unchanged when it does not end in such a suffix, or
    when nothing precedes the suffix.
    """
    match = re.match(r'^(.+?)-(\d{2})$', cikk)
    if match is None:
        return cikk
    return match.group(1)

# Index every cikkszám in the sheet by its base, so sibling variants can be
# found even when they are not themselves in scope.
base_to_all = {}
for code in all_rows:
    base_to_all.setdefault(get_base(code), []).append(code)

# Walk the in-scope cikkszáms in sorted order and build leader -> followers.
# A cikkszám that carries a variant suffix and has siblings is grouped with ALL
# siblings of its base (flagged or not); the group's leader is its
# lexicographically smallest member and also appears in the followers list.
# Anything else is its own single-member group.
# NOTE(review): the leader is the smallest sibling overall, so it may be a row
# that is not itself in scope - confirm this is intended.
leader_to_followers = {}
seen = set()
for code in sorted(in_scope):
    if code in seen:
        continue
    stem = get_base(code)
    group = sorted(base_to_all[stem])
    is_variant = stem != code and len(group) > 1
    if is_variant:
        head, members = group[0], group
    else:
        head, members = code, [code]
    leader_to_followers[head] = members
    seen.update(members)

# Process leaders in sheet order so the output is deterministic.
leaders_sorted = sorted(leader_to_followers, key=lambda c: all_rows[c]['row'])

# One record per leader, in leader order, listing every variant row it covers.
targets = [
    {
        'leader': leader,
        'leader_row': all_rows[leader]['row'],
        'leader_name': all_rows[leader]['name'],
        'variants': [
            {'cikk': member, 'row': all_rows[member]['row']}
            for member in leader_to_followers[leader]
        ],
    }
    for leader in leaders_sorted
]

variant_rows = sum(len(t['variants']) for t in targets)
print(f'Total leaders to process: {len(targets)}')
print(f'Total variant rows that will be updated: {variant_rows}')

# 5. Write targets.json (regenerated on every run)
with open(TARGETS, 'w', encoding='utf-8') as targets_file:
    json.dump(targets, targets_file, ensure_ascii=False, indent=1)
print(f'Wrote {os.path.basename(TARGETS)}')

# 6. Report an existing progress.json, or create an empty checkpoint
if os.path.exists(PROGRESS):
    # An existing checkpoint is only read and summarised, never overwritten.
    with open(PROGRESS, encoding='utf-8') as progress_file:
        prog = json.load(progress_file)
    done_count = len(prog["done_leaders"])
    print(f'Progress exists: {done_count} done, next_index={prog["next_index"]}')
else:
    initial_progress = {'done_leaders': [], 'next_index': 0}
    with open(PROGRESS, 'w', encoding='utf-8') as progress_file:
        json.dump(initial_progress, progress_file, ensure_ascii=False, indent=1)
    print(f'Initialized {os.path.basename(PROGRESS)}')
