
    Fj                       U d Z ddlmZ ddlmZmZ ddlmZmZm	Z	m
Z
 ddlZddlZdZdIdZe G d d                      Ze G d d                      ZdJdZdKdZdLdZdMdZdNdZdOdZdPd"Zeeee         ee         eegee         f         ZdQd$ZdRd&ZdSd)ZdTd,ZdUd.Z dUd/Z!dUd0Z"dUd1Z#dUd2Z$dUd3Z%dUd4Z&dUd5Z'e e!e"e#e$e%e&e'gZ(d6e)d7<   d8Z*d9d9d9d:d;d<Z+dVd>Z,dWd?Z-ddd@dXdEZ.dYdHZ/dS )ZuL  Kanban diagnostics — structured, actionable distress signals for tasks.

A ``Diagnostic`` is a machine-readable description of something that's wrong
with a kanban task: a hallucinated card id, a spawn crash-loop, a task
stuck blocked for too long, etc. Each one carries:

* A **kind** (canonical code; UI/tests match on this).
* A **severity** (``warning`` / ``error`` / ``critical``).
* A **title** (one-line human description) and **detail** (longer text).
* A list of **suggested actions** — structured entries the dashboard
  turns into buttons and the CLI turns into hints.

Rules run over (task, recent events, recent runs) and emit diagnostics.
They are stateless and read-only — no DB writes. Callers compute
diagnostics on demand (on ``/board`` load, ``/tasks/:id`` fetch, or
``hermes kanban diagnostics``).

Design goals:

* Fixable-on-the-operator's-side signals only (missing config, phantom
  ids, crash loop). Not "the provider returned 502 once" — that's a
  transient runtime blip, not a diagnostic.
* Recoverable: every diagnostic comes with at least one suggested
  recovery action the operator can actually take from the UI.
* Auto-clearing: when the underlying failure mode resolves (a clean
  ``completed`` event arrives, a spawn succeeds, the task gets
  unblocked), the diagnostic stops firing. The audit event trail stays.
    )annotations)	dataclassfield)AnyCallableIterableOptionalN)warningerrorcriticalseverityOptional[str]	thresholdreturnboolc                    |dS | t           vs	|t           vrdS t                               |           t                               |          k    S )z=Return True when ``severity`` meets or exceeds ``threshold``.NTF)SEVERITY_ORDERindex)r   r   s     </usr/local/lib/hermes-agent/hermes_cli/kanban_diagnostics.pyseverity_at_or_abover   ,   sN    t~%%.)H)Hu))^-A-A)-L-LLL    c                  `    e Zd ZU dZded<   ded<    ee          Zded<   dZd	ed
<   ddZ	dS )DiagnosticActionu0  A single recovery action attached to a diagnostic.

    The ``kind`` determines how both the UI and CLI render it:

    * ``reclaim`` / ``reassign`` — POST to the matching /tasks/:id/*
      endpoint; dashboard wires into the existing recovery popover.
    * ``unblock`` — PATCH status back to ``ready`` (for stuck-blocked
      diagnostics).
    * ``cli_hint`` — print/copy a shell command (e.g.
      ``hermes -p <profile> auth``). No HTTP side effect.
    * ``open_docs`` — deep-link to the docs URL named in ``payload.url``.
    * ``comment`` — nudge the operator to add a comment (for
      stuck-blocked tasks that need human input).

    ``suggested=True`` marks the action as the recommended first step;
    the UI highlights it. Multiple actions can be suggested if they're
    equally valid.
    strkindlabeldefault_factorydictpayloadFr   	suggestedr   c                8    | j         | j        | j        | j        dS )Nr   r   r    r!   r#   selfs    r   to_dictzDiagnosticAction.to_dictO   s%    IZ|	
 
 	
r   Nr   r   )
__name__
__module____qualname____doc____annotations__r   r   r    r!   r&    r   r   r   r   5   sw          & IIIJJJE$///G////I
 
 
 
 
 
r   r   c                      e Zd ZU dZded<   ded<   ded<   ded<    ee          Zded	<   d
Zded<   d
Z	ded<   dZ
ded<   dZded<    ee          Zded<   ddZdS )
Diagnosticz%One active distress signal on a task.r   r   r   titledetailr   list[DiagnosticAction]actionsr   intfirst_seen_atlast_seen_at   countNOptional[int]run_idr   datar   c                    | j         | j        | j        | j        d | j        D             | j        | j        | j        | j        | j	        d
S )Nc                6    g | ]}|                                 S r-   )r&   ).0as     r   
<listcomp>z&Diagnostic.to_dict.<locals>.<listcomp>o   s     :::		:::r   
r   r   r0   r1   r3   r5   r6   r8   r:   r;   rA   r$   s    r   r&   zDiagnostic.to_dicti   sS    IZk::T\:::!/ -ZkI
 
 	
r   r'   )r(   r)   r*   r+   r,   r   listr3   r5   r6   r8   r:   r   r;   r&   r-   r   r   r/   r/   X   s         //IIIMMMJJJKKK&+eD&A&A&AGAAAAMLENNNN F    t,,,D,,,,
 
 
 
 
 
r   r/   c                   | |S 	 t          | d          r||                                 v r| |         S n# t          $ r Y nw xY wt          | t                    r|                     ||          S t          | ||          S )a'  Read a field from a task regardless of representation.

    Callers pass sqlite3.Row (dict-like with [] but no attribute
    access), kanban_db.Task dataclasses (attribute access), or plain
    dicts (both). This normalises them so rule functions don't have
    to branch on type each time.
    Nkeys)hasattrrD   	Exception
isinstancer   getgetattr)tasknamedefaults      r   _task_fieldrM   |   s     | 4   	TTYY[[%8%8:   $ 'xxg&&&4w'''s   -5 
AAr   c                    t          | dd          }|i S t          |t                    r|S t          |t                    r)	 t	          j        |          pi S # t          $ r i cY S w xY wi S )z<Tolerate event.payload being either a dict or a JSON string.r    N)rM   rG   r   r   jsonloadsrF   )evps     r   _parse_payloadrS      s    B	4((Ay	!T !S 	:a==&B& 	 	 	III	Is   A A('A(r   c                (    t          | dd          pdS )Nr    rM   )rQ   s    r   _event_kindrW      s    r62&&,",r   r4   c                F    t          | dd          }t          |pd          S )N
created_atr   )rM   r4   )rQ   ts     r   	_event_tsr[      s#    Ba((AqvA;;r   eventsIterable[Any]r   	list[Any]c                    g }| D ]E}t          |          }|dv r|                                 *||k    r|                    |           F|S )a[  Return events of ``kind`` that have no ``completed``/``edited``
    event *strictly after* them. Walks chronologically: each clean
    event resets the accumulator; each matching event gets appended.

    Events must be sorted by id (i.e. arrival order); callers pass the
    task's full event list which the DB already returns in that order.
    >   edited	completed)rW   clearappend)r\   r   activerQ   ks        r   _active_hallucination_eventsrf      s_     F  OO'''LLNNNN$YYMM"Mr   c                t    d}| D ]2}t          |          dv rt          |          }t          ||          }3|S )zTimestamp of the most recent clean completion / edit event.

    Kept for general "has this task ever been successfully completed"
    lookups; hallucination rules use ``_active_hallucination_events``
    instead because they need strict ordering.
    r   >   r`   ra   )rW   r[   max)r\   latestrQ   rZ   s       r   _latest_clean_event_tsrj      sH     F $ $r??555"A^^FMr   rJ   r   runningr2   c                   g }|r%|                     t          ddi                      |                     t          ddd|i                     |S )NreclaimzReclaim taskr   r   r    reassignzReassign to different profilereclaim_first)rc   r   )rJ   rk   outs      r   _generic_recovery_actionsrr      s    "$C 

# 
 
 
 	 	 	
 JJ- '*     
 Jr   slotc                T   t          | t                    sdS t          |                     d          pd                                                                          }|r|dk    rdS dD ];}t          |                     |          pd                                          r dS <dS )uc  Return True if the auxiliary slot has user-supplied non-default fields.

    Defaults from ``DEFAULT_CONFIG`` use ``provider: "auto"`` with empty
    model/base_url/api_key — that path falls through to the main model. An
    "explicit" config is one where the user actively set a provider (not
    "auto"), or supplied a model / base_url / api_key.
    FproviderrU   autoT)modelbase_urlapi_key)rG   r   r   rH   striplower)rs   ru   keys      r   _aux_slot_explicitr}      s     dD!! u488J''-2..4466<<>>H H&&t/  txx}}"##))++ 	44	5r   
raw_configc                2   t          | t                    sdS |                     d          }t          |t                    rt          |                    d          pd                                          }t          |                    d          p+|                    d          p|                    d          pd                                          }t          |o|          S t          t          |pd                                                    S )a  Best-effort check that a main model is configured.

    Diagnostics runs in the dashboard process which may not share the CLI's
    runtime state, so we read the raw config dict. If we cannot prove the
    main model is set, we err on the side of NOT firing the diagnostic.
    Frw   ru   rU   rL   rK   )rG   r   rH   r   rz   r   )r~   	model_cfgru   rw   s       r   _main_model_visibler     s    j$'' uw''I)T"" (y}}Z006B77==??MM)$$ }}W%%}}V$$ 	
 

 %'' 	 H&'''IO$$**,,---r   configOptional[dict]c                   t          | t                    sdS |                     d          }t          |t                    r|S |                     d          }t          |                     d          t                    r|                     d          ni }t          |t                    s|sd| vrdS d}d}t          |t                    rDt          |                    d                    }t          |                    d                    }d	}t          |t                    r&d
|v r"t	          |                    d
                    }|||t          |           dS )u)  Inspect raw config and report whether triage paths look configured.

    Returns ``None`` when config context is unavailable (suppress diagnostic
    to avoid noisy false positives in tests / low-level callers). Otherwise
    returns a dict with:

      - ``auto_decompose``: bool — whether the dispatcher auto-runs decompose
      - ``decomposer_explicit``: bool — user-supplied decomposer slot
      - ``specifier_explicit``: bool — user-supplied specifier slot
      - ``main_model_visible``: bool — main model can serve as auto fallback
    Ntriage_aux_status	auxiliarykanbanrw   Fkanban_decomposertriage_specifierTauto_decompose)r   decomposer_explicitspecifier_explicitmain_model_visible)rG   r   rH   r}   r   r   )r   explicitaux
kanban_cfgr   r   r   s          r   r   r     sn    fd## tzz-..H(D!! 
**[
!
!C)3FJJx4H4H$)O)OWH%%%UWJ sD!! 6!!t#t M09L1M1MNN/8J0K0KLL N*d## @(8J(F(Fjnn-=>>?? )201&99	  r   valuerL   c                j    	 t          |           }n# t          t          f$ r |cY S w xY w|dk    r|n|S )Nr7   )r4   	TypeError
ValueError)r   rL   parseds      r   _positive_intr   N  sO    Uz"   q[[66g-s    ((list[Diagnostic]c                    t          |d          }|sg S g }t          |d                   }t          |d                   }|D ]E}	t          |	          }
|
                    dg           pg D ]}||vr|                    |           Ft          | d          dk    }g }|                    t          ddd	
                     |                    t          | |                     t          dddd|||t          |          d|i	  	        gS )aI  Blocked-hallucination gate fires: a worker called kanban_complete
    with created_cards that didn't exist or weren't created by the
    completing profile. Task stayed in its prior state; the operator
    needs to decide how to proceed.

    Auto-clears when a successful completion (or edit) follows the
    blocked event.
     completion_blocked_hallucinationr   phantom_cardsstatusrk   commentz#Add a comment explaining what to doFr   r   r!   rk   hallucinated_cardsr   z%Worker claimed cards that don't exista  The completing worker declared created_cards that either didn't exist or weren't created by its profile. The completion was blocked and the task stayed in its prior state. Usually means the worker hallucinated ids instead of capturing return values from kanban_create.phantom_ids	r   r   r0   r1   r3   r5   r6   r8   r;   )rf   r[   rS   rH   rc   rM   r   extendrr   r/   len)rJ   r\   runsnowcfghitsr   firstlastrQ   r    pidrk   r3   s                 r   _rule_hallucinated_cardsr   V  s_    (0RSSD 	Kd1gET"XD ( ( $$;;339r 	( 	(C+%%""3'''	( $))Y6G&(GNN#3     
 NN,T7CCCDDD!51 $ii[)    r   c                   t          | d          dk    rg S t          |          }|g S t          |                    d                    }t          |                    d                    }t          |                    d                    }t          |                    d                    }	|rd}
|}d	}|}d
}d}nd	}
|}d}|}d}d}|s|	rg S t          | d          pd}t	          dd|
 dd|
 did          g}|s0|	s.|                    t	          dd| dd| di                     |s-|                    t	          dd| dd| i                     t          ddd| dd| d |||d!|||
|	d"#	  	        gS )$u  A triage task cannot leave triage without an auxiliary helper.

    With the auto-decompose dispatcher (kanban.auto_decompose, default True),
    triage tasks fan out via ``auxiliary.kanban_decomposer`` and fall back to
    ``auxiliary.triage_specifier`` when the decomposer returns ``fanout=false``.
    With auto-decompose off, the user must run ``hermes kanban specify``,
    which only needs ``auxiliary.triage_specifier``.

    The default slot is ``provider: auto`` → auto-falls back to the main model,
    so this rule only fires when:

      - the relevant slot is explicitly set to something broken, OR
      - the auto fallback has no main model to fall back to.

    Config context is required; pass {} from tests to keep the rule silent.
    r   triageNr   r   r   r   zauxiliary.kanban_decomposerzauxiliary.triage_specifier
decomposerzAuto-decompose is on, so the dispatcher needs auxiliary.kanban_decomposer (with auxiliary.triage_specifier as a fallback for non-fan-out tasks).	specifierzkAuto-decompose is off, so triage tasks need `hermes kanban specify`, which uses auxiliary.triage_specifier.idz	<task_id>cli_hintz
Configure commandzhermes config set z.provider autoTr#   zOr configure fallback rn   z(Specify manually: hermes kanban specify zhermes kanban specify triage_aux_unavailabler
   zTriage z has no usable modelzZThis task is still in triage and no working auxiliary model is visible to the dispatcher. z The default slot uses `provider: auto` which falls back to the main model, but no main model is configured either. Configure the slot directly or set a main model so the auto fallback can take over.r7   )task_idr   primary_slotr   r   )rM   r   r   rH   r   rc   r/   )rJ   r\   r   r   r   r   r   r   r   main_visibler   primary_explicitfallback_slotfallback_explicitprimary_descdetail_pathr   r3   s                     r   _rule_triage_aux_unavailabler     s   " 4""h..	s##F~	&**%56677Nvzz*?@@AAfjj)=>>??

#78899L  
4.4.#1 	 4-5/"N 	  < 	$%%4G-|--EEEE
 		
 		
 		
G  	\ 	':=::FFFF
 
 
 	 	 	  'FWFF B B BC
 
 
 	 	 	 %::::>*5> > > ,(".	
 
    r   c                   t          |d          }|sg S g }|D ]C}t          |                              dg           pg D ]}||vr|                    |           Dt	          | d          dk    }	t          ddddt          | |		          t          |d
                   t          |d                   t          |          d|i	  	        gS )zAdvisory prose-scan: the completion summary mentions ``t_<hex>``
    ids that don't resolve. Non-blocking; surfaced as a warning only.

    Auto-clears when a fresh clean completion arrives AFTER the
    suspected event.
    !suspected_hallucinated_referencesphantom_refsr   rk   prose_phantom_refsr
   z.Completion summary references unknown task idszThe completion summary mentions task ids that don't resolve in this board's database. The completion itself succeeded, but downstream consumers parsing the summary may be pointed at cards that never existed.r   r   r   r   )	rf   rS   rH   rc   rM   r/   rr   r[   r   )
rJ   r\   r   r   r   r   r   rQ   r   rk   s
             r   _rule_prose_phantom_refsr     s    (0STTD 	 L ) )!"%%))."==C 	) 	)C,&&##C(((	) $))Y6G!>+
 *$@@@Q((tBx(($iil+    r   c                   t          |                    d|                    dd                    d          }t          |                    d          |          }t          | dd          t          | dd          nt          | dd          }|||k     rg S t          | d	d          t          | d	d          nt          | d
d          }t          | d          }	t          |d           }
d}t	          |
          D ]}t          |d          }|dv r|} ng }|dk    rh|	rf|	dk    r`|                    t          dd|	 ddd|	 did                     |                    t          dd|	 ddd|	 di                     nD|dv r@t          | d          }|r.|                    t          dd| dd| id                     |                    t          | t          | d           d!k    "                     ||d#z  k    rd$nd%}|r|pd&	                                nd&}|r"|dd'         t          |          d'k    rd(nd&z   nd&}d)d*d+d,                    |pd&d-          }|r;d.| d/| d0|                                d         dd1          }d2| d3| d4| d5| d6	}nd.| d/| d7}d2| d3| d8}t          d9||||||||||||d:;	  	        gS )<u  Task's unified ``consecutive_failures`` counter is climbing —
    something about this task+profile combo is broken and each retry
    fails the same way. Triggers regardless of the specific failure
    mode (spawn error, timeout, crash) because operationally they
    all look the same: the kernel keeps retrying and the operator
    needs to intervene.

    Threshold: cfg["failure_threshold"]. Runtime callers should derive
    this from ``kanban.failure_limit`` unless the user explicitly set a
    diagnostics threshold, so the signal does not lag behind the
    dispatcher's circuit breaker.

    Accepts the legacy ``spawn_failure_threshold`` config key for
    back-compat.
    failure_thresholdspawn_failure_threshold   failure_limitconsecutive_failuresNspawn_failuresr   last_failure_errorlast_spawn_errorassigneec                $    t          | dd          S Nr   r   rV   rs    r   <lambda>z)_rule_repeated_failures.<locals>.<lambda>@  s    k!T1.E.E r   r|   outcome>   crashed	timed_outspawn_failedr   rL   r   zVerify profile: hermes -p z doctorr   z
hermes -p Tr#   zFix profile auth: hermes -p z authrn   >   r   r   r   Check logs: hermes kanban log hermes kanban log r   rk   r      r   r   rU        …spawntimeoutcrash)r   r   r   failurezAgent z xz:    zThis task has failed z times in a row (most recent: z). Full last error:

z3

The dispatcher circuit breaker is configured for z_ consecutive non-success attempts. Fix the root cause and reclaim or unblock the task to retry.z (no error recorded)zP) but no error text was captured. Check the suggested command or the worker log.repeated_failures)r   most_recent_outcome
last_errorr   r   r   )r   rH   rM   sortedreversedrc   r   r   rr   rz   r   
splitlinesr/   )rJ   r\   r   r   r   r   r   failureslast_errr   ordered_runsr   r   ocr3   r   r   err_texterr_snippetoutcome_labelr0   r1   s                         r   _rule_repeated_failuresr     s     cgg)1--  	
 
I "#''/":":IFFM t3T::F 	D0$777/33 
 8i//	 t1488D 	D.5551488 
 4,,H
 $$E$EFFFLl##  I&&999"$E : ')Gn,,,h)>S>S'@x@@@ >X > > >?	
 
 
 	 	 	 	'@@@@ <X < < <=
 
 
 	 	 	 	
 
 8	8	8 dD)) 	NN+@w@@"$B$B$BC	      NN,k$11Y>      &Q66zzGH+3;B%%'''HMU](4C4.S]]S-@-@EEbII[]K  
c

#Y//	 
  
YYY(YYk6L6L6N6Nq6QRVSVRV6WYYDH D D*D DD D 	D D D 	 IHH(HHHHH H H*H H H 	
  $,#6"!**
 
    r   c                   t          |                    d|                    dd                              }t          | dd          pd}||k    rg S t          |                    dd                    }t          |d 	          }d}	d
}
t	          |          D ]6}t          |d          }|dk    r|	dz  }	|
t          |d          }
0|dv r n7|	|k     rg S t          | d          }g }|r.|                    t          dd| dd| id                     t          | d          dk    }|                    t          | |                     |	|dz  k    rdnd}|
r|
pd	                                nd}|r"|d
d         t          |          dk    rdndz   nd}|r1d|	 d|                                d         d
d           }d!|	 d"| }nd|	 d#}d!|	 d$}t          d%|||||||	|	|
d&'	  	        gS )(uQ  The worker spawns fine but keeps crashing mid-run. Check the last
    N runs' outcomes; N consecutive ``crashed`` without a successful
    ``completed`` means something about the task + profile combo is
    broken (OOM, missing dependency, tool it needs is down).

    Threshold: cfg["crash_threshold"] (default 2).

    Narrower than ``repeated_failures`` — fires earlier (2 crashes vs 3
    total failures) so the operator gets a crash-specific heads-up
    before the unified rule kicks in. Suppresses itself when the
    unified rule is also about to fire, to avoid double-flagging.
    r   r   r   r   r   crash_thresholdr   c                $    t          | dd          S r   rV   r   s    r   r   z(_rule_repeated_crashes.<locals>.<lambda>  s    Qa)@)@ r   r   Nr   r   r7   r   >   ra   	reclaimedr   r   r   r   r   Tr#   r   rk   r   r   rU   r   r   zAgent crashed zx: r   z	The last z4 runs ended with outcome=crashed. Full last error:

zx (no error recorded)z_ runs ended with outcome=crashed but no error text was captured. Check the worker log for more.repeated_crashes)consecutive_crashesr   r   )r4   rH   rM   r   r   rc   r   r   rr   rz   r   r   r/   )rJ   r\   r   r   r   r   unified_counterr   orderedconsecutiver   r   r   r   r3   rk   r   r   r   r0   r1   s                        r   _rule_repeated_crashesr     s    CGG)1--   
 	D0!449  +++	CGG-q1122IT@@AAAGKHg  a++i1K&q'22222E
 Y	$%%G&(G '<7<< >W > >?	
 
 
 	 	 	 $))Y6GNN,T7CCCDDD(IM99zzwH ,4;B%%'''HMU](4C4.S]]S-@-@EEbII[]K 
TTT1G1G1I1I!1LTcT1RTT1 1 1#.1 1 	
 DCCCJ J J J 	 %0II
 
 
 
 
r   c                2   t          |                    dd                    }t          | d          }|dk    rg S d}|D ]4}t          |          dk    rt	          |          }	t          ||	          }5|dk    rg S ||z
  dz  }
|
|k     rg S |D ]*}t          |          dv rt	          |          |k    rg c S +t          dd	d
          g}t          dddt          |
           ddt          |
           d|||d|t          |
d          d	  	        gS )zTask has been in ``blocked`` status for too long without a comment.

    Threshold: cfg["blocked_stale_hours"] (default 24).
    Surfaced as a warning so humans know there's a pending unblock.
    blocked_stale_hours   r   blockedr   g      @>   	commented	unblockedr   z Add a comment / unblock the taskTr   stuck_in_blockedr
   zTask has been blocked for hz"This task transitioned to blocked u   h ago and has had no comments or unblock attempts since. Blocked tasks are waiting for human input — check the block reason and either unblock with feedback or answer with a comment.r7   )
blocked_at	age_hoursr   )
floatrH   rM   rW   r[   rh   r   r/   r4   round)rJ   r\   r   r   r   hoursr   last_blocked_tsrQ   rZ   r  r3   s               r   _rule_stuck_in_blockedr    s    #''/4455Ex((F	O 6 6r??i''"A!/155O!	&&0I5	  r??888Yr]]_=\=\III4	
 	
 	
'G <3y>><<<FY F F F
 %$+%	1:M:MNN    r   c                   t          |                    d          d          }t          |                    dd                    }||z
  }d}d}	d}
d}|D ]I}t          |          }||k     rt	          |          }|dk    r|
dk    r|}
|	r	|dz  }|}d}	A|d	k    rd
}	J||k     rg S t          | d          }g }|r.|                    t          dd| dd| id
                     t          ddd| dt          |dz             dd| d||
rt          |
          nt          |          |rt          |          nt          |          ||t          |          d	  	        gS )ut  Task has cycled through blocked → unblocked many times — the
    ``unblock`` is not fixing the underlying problem and the worker
    keeps re-blocking for substantially the same reason.

    ``_rule_stuck_in_blocked`` resets its timer on any ``commented`` /
    ``unblocked`` event, so a task that cycles every few minutes is
    invisible to it regardless of how many times it cycles (#29747
    gap 1). This rule complements that one by counting block→unblock
    cycles in a sliding window.

    Threshold: cfg["block_cycle_threshold"] (default 3) cycles within
    cfg["block_cycle_window_seconds"] (default 24h).
    block_cycle_thresholdr   block_cycle_window_secondsiQ r   Fr   r7   r   Tr   r   z*Check block reasons: hermes kanban events r   zhermes kanban events r#   block_unblock_cyclingr
   u   Task block→unblock cycled zx in   r   zThis task has been blocked z times after being unblocked, suggesting the unblock is not addressing the root cause and the worker keeps hitting the same wall. Review the block reasons in the event history; a different intervention (reassign, change scope, archive) may be needed.)cycleswindow_secondsr   )
r   rH   r  r[   rW   rM   rc   r   r/   r4   )rJ   r\   r   r   r   r   r  cycle_cutoffr  seen_unblock_since_last_cycleinitial_blocked_tslast_cycle_blocked_tsrQ   tsr   r   r3   s                    r   _rule_block_unblock_cyclingr    s    cgg&=>>BBI377#?KKLLN'L F$)! 1 1r]]29!Q&&%'", 6!(*%05-[  ,0)		$%%G&(G 'HwHH A A AB	
 
 
 	 	 	 $UVUU#nT>Q:R:RUUUL& L L L 1CQc,---S3HVS.///cRUhh!.11
 
    r   c                .   t          |                    dd                    }t          | d          }|dk    rg S t          | d          rg S t          | d          pd}|                                sg S h d}d	}	|D ]2}
t	          |
          |v rt          |
          }t          |	|          }	3|	d	k    r!t          t          | d
d	          pd	          }	|	d	k    rg S ||	z
  }||k     rg S |dk    r
|dz  dd}nt          |dz             d}||dz  k    rd}n||dz  k    rd}nd}t          ddd|i          t          ddddi          g}t          d|d| d d!| d"|d#||	|	d$|	t          |          |t          |          d%&	  	        gS )'u  Task has been in ``ready`` status for too long without any worker
    claiming it.

    Threshold: cfg["stranded_threshold_seconds"] (default 1800 = 30 min).

    Catches every "task waiting for a worker that never comes" case
    without caring WHY:

    * Operator typo'd the assignee — no profile or external worker matches.
    * Profile was deleted, leaving its tasks stranded.
    * External worker pool (Codex CLI, Claude Code lane, custom daemon)
      is down, hung, or wasn't started.
    * Dispatcher is misconfigured (wrong board, wrong HERMES_HOME).

    Pre-rule, all of these silently rotted in ``skipped_nonspawnable`` —
    the dispatcher correctly skipped them (good — no respawn loop) but
    nobody surfaced the fact that operator-actionable work was
    accumulating. The rule fires when a ready task's promoted-to-ready
    timestamp is older than the threshold AND the assignee is non-empty
    (truly unassigned tasks have their own ``skipped_unassigned`` signal
    on the dispatcher and a different operator response).

    The signal is age-based on purpose: it's identity-agnostic, so it
    works for Hermes profiles, registered lanes, external workers, and
    typos uniformly. No registry to curate, no per-board allowlist.
    stranded_threshold_seconds  r   ready
claim_lockr   rU   >   createdpromotedr   r   r   rY   )rL   r  z.1fr   <   m   r   r   r   r
   ro   zReassign to a different workercurrent_assigneern   r   zCheck dispatcher statusr   zhermes kanban diagnosticsstranded_in_readyz
Ready for z with no workerzThis task has been ready for z5 but nothing has claimed it. Common causes: assignee z is misspelled, the profile was deleted, or the external worker pool for this lane is down. Confirm the assignee is correct and that a worker is actually polling for it.r7   )ready_sinceage_secondsr   threshold_secondsr   )
r  rH   rM   rz   rW   r[   rh   r4   r   r/   )rJ   r\   r   r   r   r#  r   r   READY_TRANSITION_KINDSlast_ready_tsrQ   rZ   r"  age_strr   r3   s                   r   _rule_stranded_in_readyr'  g  s   6 ,g66  x((F	 4&& 	4,,2H>>  	   M 2 2r??444"Aq11M
 KlAFFFK!LL	%K&&&	 d 4'....r)**---
 '!+++	)A-	-	- 	2'2	
 	
 	

 	+ ;<	
 	
 	
G  37333HG H H3;H H H #"({++ !$%6!7!7	
 
    r   zlist[RuleFn]_RULES)r   r   r   r   r   r   r  r   r   r   r  )r   r   r   r   r  r   c                    | pi } t          |                     d          pi           }|                    d|                     dt          d                              d|vrd|vr|d         |d<   |S )aF  Build diagnostics config from the runtime ``kanban`` config section.

    ``kanban.diagnostics.failure_threshold`` remains an explicit override.
    Otherwise, derive the repeated-failure threshold from
    ``kanban.failure_limit`` so CLI/dashboard diagnostics match the
    dispatcher's actual circuit-breaker threshold.
    diagnosticsr   r   r   )r   rH   
setdefaultDEFAULT_CONFIG)r   diag_cfgs     r   config_from_kanban_configr.    s     !rJJNN=117R88H7J(KLL  
 	8++%X55(0(A$%Or   c                $   | pi } t          | t                    si S i }|                     d          }t          |t                    r'|                    t	          |                     ||d<   dD ]}|                     |          }||||<   |S )a]  Build diagnostics config from the full Hermes runtime config.

    Carries through ``kanban``, ``auxiliary``, and ``model`` keys so triage-
    aware rules can inspect the active aux-helper and main-model state.
    Folds the ``kanban`` block through ``config_from_kanban_config`` so the
    repeated-failure threshold derivation still applies.
    r   )r   rw   )rG   r   rH   updater.  )r~   r   r   r|   r   s        r   config_from_runtime_configr1  #  s     !rJj$'' 	C))J*d## #

,Z88999"H%  s##CHJr   )r   r   rB   r   r   r9   c          
       	 t          ||nt          j                              }|pi }i t          |}d|vr9d|vr5d|v r1t          |                    d          t          d                   |d<   g }t
          D ]5}	 |                     || ||||                     &# t          $ r Y 2w xY wd t          t                    D             	|
                    	fd           |S )zRun every rule against a single task's state and return a
    severity-sorted list of active diagnostics.

    Sorting: critical first, then error, then warning; ties broken by
    most-recent ``last_seen_at``.
    Nr   r   r   c                    i | ]\  }}||	S r-   r-   )r>   iss      r   
<dictcomp>z,compute_task_diagnostics.<locals>.<dictcomp>]  s    ???TQAq???r   c                P                         | j        d           | j        pd fS )Nr   r   )rH   r   r6   )dseverity_idxs    r   r   z*compute_task_diagnostics.<locals>.<lambda>_  s/    aj"---n!"
 r   r   )r4   timer,  r   rH   r(  r   rF   	enumerater   sort)
rJ   r\   r   r   r   now_tsr   rq   ruler9  s
            @r   compute_task_diagnosticsr?  :  s>    TY[[99F\rF
&^
&v
&C6))%V33v%%#0JJ''./$
 $
  C  	JJttD&$<<==== 	 	 	 H		
 @?Y~%>%>???LHH
 
 
 
     Js   >"B!!
B.-B.r*  Iterable[Diagnostic]c                    d}d}| D ]@}|j         t          v rt                              |j                   nd}||k    r	|}|j         }A|S )zlHighest severity present in the list, or None if empty. Useful
    for card badges that need a single color.r   N)r   r   r   )r*  highest_idxhighestr8  idxs        r   severity_of_highestrE  g  sa     KG ! !23*2N2Nn""1:...TVKjGNr   )r   r   r   r   r   r   )Nr'   )r   r   )r   r4   )r\   r]   r   r   r   r^   )r\   r]   r   r4   )rJ   r   rk   r   r   r2   )rs   r   r   r   )r~   r   r   r   )r   r   r   r   )r   r   rL   r4   r   r4   )r   r   )r   r   r   r   )r~   r   r   r   )
r\   rB   r   rB   r   r9   r   r   r   r   )r*  r@  r   r   )0r+   
__future__r   dataclassesr   r   typingr   r   r   r	   rO   r:  r   r   r   r/   rM   rS   rW   r[   rf   rj   rr   rB   r4   r   RuleFnr}   r   r   r   r   r   r   r   r   r  r  r'  r(  r,   DIAGNOSTIC_KINDSr,  r.  r1  r?  rE  r-   r   r   <module>rK     s    : # " " " " " ( ( ( ( ( ( ( ( 4 4 4 4 4 4 4 4 4 4 4 4   2M M M M 
 
 
 
 
 
 
 
D 
 
 
 
 
 
 
 
F( ( ( (0   - - - -   
   0   $   2 
3S	49c48$z:JJ	K   &. . . .,0 0 0 0f. . . ., , , ,^m m m m`   Dw w w wtT T T Tn/ / / /dJ J J JZx x x x|  	 	 	 	 		    #*    ,   8 !* * * * * *Z
 
 
 
 
 
r   