import spacy
from spacy.training import Example
from spacy_legacy.scorers import score_cats_v1


def test_score_cats_v1():
    """Check how ``threshold`` changes ``score_cats_v1`` for exclusive labels.

    A single example is scored twice. The reference marks "winter" as the
    only positive category; the prediction gives "winter" the highest score
    (0.35), which is still below 0.5.
    """
    seasons = ("winter", "summer", "spring", "autumn")
    nlp = spacy.blank("en")

    gold_doc = nlp("one")
    gold_doc.cats = {
        "winter": 1.0,
        "summer": 0.0,
        "spring": 0.0,
        "autumn": 0.0,
    }
    guess_doc = nlp("one")
    guess_doc.cats = {
        "winter": 0.35,
        "summer": 0.25,
        "spring": 0.2,
        "autumn": 0.2,
    }

    def micro_f(threshold):
        # Build a fresh Example and label list for every scoring run so the
        # two runs share nothing but the underlying docs.
        result = score_cats_v1(
            [Example(guess_doc, gold_doc)],
            "cats",
            labels=list(seasons),
            multi_label=False,
            threshold=threshold,
        )
        return result["cats_micro_f"]

    # With the previous threshold of 0.5 provided in the default textcat
    # config, this example is counted as incorrect even though winter should
    # be the "positive" prediction.
    assert micro_f(0.5) == 0.0

    # With no provided threshold, the score is correct.
    # (Note that cats_score is incorrectly 0.25 because there are no examples
    # for the other labels and the default F-score for nothing with PRFScore
    # is 0.0 rather than undefined.)
    assert micro_f(None) == 1.0