-- texdoclib-score.tlu: scoring functions for texdoc
--
-- The TeX Live Team, GPLv3, see texdoclib.tlu for details
-- dependencies
local md5 = require 'md5'
local texdoc = {
    const = require 'texdoclib-const',
    util = require 'texdoclib-util',
    config = require 'texdoclib-config',
}
-- shortcuts
local M = {}
-- shared variables
local global_adjscore, spec_adjscore = {}, {}
-------------------------   configuration directives   -------------------------
-- Store the numeric value of `val` under the lowercased `key` in the score
-- table `tab`, unless that key already holds a value (first setting wins).
-- Returns true when `val` converts to a number (even if an existing entry
-- was kept), false otherwise; on false, `tab` is left untouched.
local function set_score_table(tab, key, val)
    local lowered = string.lower(key)
    local number = tonumber(val)
    if not number then
        return false
    end
    -- never override an entry that is already present
    if tab[lowered] == nil then
        tab[lowered] = number
    end
    return true
end
-- Try to read a configuration line as a score directive.
-- Two forms are recognized:
--   adjscore <pattern> = <number>             (stored in global_adjscore)
--   adjscore(<keyword>) <pattern> = <number>  (stored in spec_adjscore[keyword])
-- Keyword and pattern are lowercased before being stored, and an existing
-- entry is never overridden.
-- Returns true if the line has one of these forms and its value converts to
-- a number, false otherwise.
function M.confline_to_score(line)
    -- global form first
    local g_pat, g_val =
        string.match(line, '^adjscore%s+([%w%p]+)%s*=%s*([%d+-.]+)')
    if g_pat and g_val then
        return set_score_table(global_adjscore, g_pat, g_val)
    end
    -- then the keyword-specific form
    local keyw, k_pat, k_val = string.match(line,
        '^adjscore%(([%w%p]+)%)%s+([%w%p]+)%s*=%s*([%d+-.]+)')
    if not (keyw and k_pat and k_val) then
        return false
    end
    -- create the per-keyword table on first use
    local kw = string.lower(keyw)
    local kw_table = spec_adjscore[kw] or {}
    spec_adjscore[kw] = kw_table
    return set_score_table(kw_table, k_pat, k_val)
end
----------------------------   score computation   -----------------------------
-- parse filename into <base>, <lang>, <ext>
local function parse(filename)
    -- the extension is whatever texdoc.util.get_ext reports for the name
    local ext = texdoc.util.get_ext(filename)
    local base = filename
    if ext ~= nil and ext ~= '' then
        -- cut the extension together with the one character preceding it
        base = filename:sub(1, -#ext - 2)
    end
    -- a trailing "-<code>", where <code> is a key of lang_codes, is the
    -- language part; it is removed from the base and returned as is
    for code in pairs(texdoc.const.lang_codes) do
        local tail = '-' .. code
        if base:sub(-#tail) == tail then
            return base:sub(1, -#tail - 1), code, ext
        end
    end
    -- no language suffix: the second result is nil
    return base, nil, ext
end
-- Says if pat is a "subword" of str.
-- pat is searched as plain text (no pattern magic). An occurrence counts as
-- a subword when it is delimited on both sides:
--   * left:  it starts the string, or its first character or the character
--            just before it is punctuation (%p);
--   * right: it ends the string, or its last character or the character
--            just after it is punctuation (%p).
-- Every occurrence is examined, not only the first one, so that e.g. "bar"
-- is found as a subword of "foobar-bar".
-- Returns a boolean.
local function is_subword(str, pat)
    local function is_delim(s, idx)
        return string.find(string.sub(s, idx, idx), '%p') ~= nil
    end
    local len = #str
    local init = 1
    -- the bound on init also guarantees termination for an empty pat
    while init <= len + 1 do
        local i, j = string.find(str, pat, init, true)
        if not i then
            return false
        end
        if (i == 1 or is_delim(str, i) or is_delim(str, i - 1))
            and (j == len or is_delim(str, j) or is_delim(str, j + 1)) then
            return true
        end
        -- retry from the next position (occurrences may overlap)
        init = i + 1
    end
    return false
end
-- Tell whether the last path component of `file` is a "bad" basename.
-- Each entry of the 'badbasename_list' config value is used as a Lua pattern
-- anchored at the start of the basename; it must either match the whole
-- basename or be followed by a '.'.
local function has_bad_basename(file)
    -- strip everything up to and including the last '/'
    local basename = file:gsub('.*/', '')
    for _, bad in ipairs(texdoc.config.get_value('badbasename_list')) do
        local anchored = '^' .. bad
        if basename:find(anchored .. '$') or basename:find(anchored .. '%.') then
            return true
        end
    end
    return false
end
-- Compute a pattern score -10 <= s < 10 for the file `name` against `pat`.
--   name:      file name to score
--   pat:       pattern (keyword) to score against
--   dbg_score: printf-like debug function (format string, then arguments)
-- The score starts at -10 and is raised to the best of:
--   4    exact match of pat
--   1    subword match of pat
--   4.5  exact match of pat .. suffix   (suffixes from 'suffix_list')
--   3.5  subword match of pat .. suffix
-- Derived patterns are only tried when pat contains no '/'.
-- A positive score is then forced down to 0.1 if the extension is listed in
-- 'badext_list_inv' or the basename is bad, and finally raised by 1.5 if pat
-- (without '/') is one of the directory components of name.
local function pattern_score(name, pat, dbg_score)
    -- pat is passed as an argument rather than spliced into the format
    -- string, so that a '%' in it cannot be read as a conversion specifier
    dbg_score('Start heuristic scoring with pattern: %s', pat)
    -- score management
    local score = -10
    -- raise the score to s (or set it unconditionally when force is true)
    local function upscore(s, reason, force)
        if s > score or force then
            score = s
            dbg_score('New heuristic score: %.1f. Reason: %s', s, reason)
        end
    end
    -- look for exact or subword match
    if M.is_exact(name, pat) then
        upscore(4, 'exact match')
    elseif is_subword(name, pat) then
        upscore(1, 'subword match')
    end
    -- try derivatives unless pat contains a slash
    local slash = not not string.find(pat, '/', 1, true)
    if not slash then
        for _, suffix in ipairs(texdoc.config.get_value('suffix_list')) do
            local deriv = pat .. suffix
            if M.is_exact(name, deriv) then
                upscore(4.5, 'exact match for derived pattern: ' .. deriv)
            elseif is_subword(name, deriv) then
                upscore(3.5, 'subword match for derived pattern: ' .. deriv)
            end
        end
    end
    -- if extension is bad, score becomes an epsilon
    -- (ext is nil when ext_pos returns an index outside ext_list)
    local ext = texdoc.config.get_value('ext_list')[M.ext_pos(name)]
    if ext and texdoc.config.get_value('badext_list_inv')[ext] and score > 0 then
        upscore(0.1, 'bad extension', true)
    end
    -- if basename is bad, score becomes an epsilon
    if has_bad_basename(name) and score > 0 then
        upscore(0.1, 'bad basename', true)
    end
    -- bonus for being in the right directory
    -- (plain search for "/pat/" in "/name")
    if string.find('/' .. name, '/' .. pat .. '/', 1, true) and not slash then
        upscore(score + 1.5, 'directory bonus')
    end
    -- done
    dbg_score('Final heuristic score: %.1f', score)
    return score
end
-- Set the score of a docfile.
--   df:          docfile table; the fields read here are normname, realpath,
--                matches, tree, tlptodoc, runtodoc, details and lang
--   original_kw: keyword used to look up keyword-specific adjustments
-- The result is stored as a float in df.score; nothing is returned.
-- The score is built in this order: best score over df.matches, fallback
-- from package/file associations, metadata bonus, locale bonus/malus,
-- keyword-specific adjscore, global adjscore.
local function set_score(df, original_kw)
    -- scoring is case-insensitive (patterns are already lowercased)
    local name = df.normname:lower()
    -- we use normname hash for cross-platform consistency
    local df_hash = md5.sumhexa(name):sub(1, 7)
    -- special debugging function (printf-like: format string, then arguments)
    -- NOTE(review): dbg_print is assumed to apply string.format to its
    -- message, as the '%.1f' specifiers used below imply -- confirm
    local function dbg_score(msg, ...)
        -- add the hash id prefix to make the outputs grep-friendly
        local prefixed = string.format('(%s) ', df_hash) .. msg
        texdoc.util.dbg_print('score', prefixed, ...)
    end
    -- path and name are passed as arguments, never spliced into the format
    -- string: a '%' in a file name must not be read as a conversion specifier
    dbg_score('Start scoring %s', (texdoc.util.w32_path(df.realpath)))
    dbg_score('Name used: %s', name)
    -- get score from patterns
    local score = -10
    local is_alias = false
    for _, pat in ipairs(df.matches) do
        local s = -10
        local p = string.lower(pat.name)
        if pat.original then  -- non-alias
            if df.tree > -1 then
                s = pattern_score(name, p, dbg_score)
            else
                s = 1
            end
        elseif M.is_exact(name, p) then  -- alias
            is_alias = true
            local bonus, note = 0, ''
            if pat.locale then
                bonus, note = 5, ', (language-based)'
            end
            s = (pat.score or 10) + bonus  -- default alias score is 10
            dbg_score('Matching alias "%s", score: %.1f%s', pat.name, s, note)
        end
        if s > score then score = s end
    end
    dbg_score('Max pattern score: %.1f', score)
    -- get score from tlp associations
    if score == -10 and df.tlptodoc then
        score = -1
        dbg_score('New score: %.1f from package name association', score)
    end
    if score == -10 and df.runtodoc then
        score = -5
        dbg_score('New score: %.1f from sty/cls association', score)
    end
    -- bonus for metadata
    if df.details then
        if string.find(string.lower(df.details), 'readme') then
            score = score + 0.1
            dbg_score('Catalogue "readme" bonus: +0.1')
        else
            score = score + 1.5
            dbg_score('Catalogue details bonus: +1.5')
        end
    end
    -- bonus for locale
    local config_lang = texdoc.config.get_value('lang')
    if not is_alias then
        local file_lang
        -- from its catalogue
        if df.lang then
            -- take first two letters; it may have country codes
            file_lang = df.lang:sub(1, 2)
        end
        -- from its filename
        if not file_lang then
            -- only the second result of parse is needed; a local placeholder
            -- is used so that no global variable named '_' is written
            local _, lang_key = parse(name)
            -- lang_key may be nil, in which case the lookup yields nil
            file_lang = texdoc.const.lang_codes[lang_key]
        end
        if config_lang ~= nil and config_lang == file_lang then
            score = score + 1
            dbg_score('Locale match bonus: +1.0')
        elseif file_lang ~= nil and file_lang ~= 'en' then
            -- normally, english documents do not have file_lang,
            -- but sometimes catalogue includes en info (e.g., geometry)
            -- we want to treat both cases similar
            score = score - 0.1
            dbg_score('Locale unmatch: -0.1')
        end
    end
    -- adjust from keyword-specific tables
    if df.tree > -1 and spec_adjscore[original_kw] then
        for pat, val in pairs(spec_adjscore[original_kw]) do
            if val and is_subword('/' .. name, pat) then
                score = score + val
                dbg_score('Adjust by %.1f from specific pattern "%s"', val, pat)
            end
        end
    end
    -- adjust from global tables
    if df.tree > -1 then
        for pat, val in pairs(global_adjscore) do
            if val and is_subword('/' .. name, pat) then
                -- a file still at the minimal score is only adjusted downwards
                if score > -10 or val < 0 then score = score + val end
                dbg_score('Adjust by %.1f from global pattern "%s"', val, pat)
            end
        end
    end
    dbg_score('Final score: %.1f', score)
    -- the final score should be a float value
    df.score = score + 0.0
end
-- Score every docfile of `list` for the keyword `original_kw`.
-- Entries are visited in sequence order; each one is updated in place by
-- set_score. Nothing is returned.
local function set_list_scores(list, original_kw)
    for _, docfile in ipairs(list) do
        set_score(docfile, original_kw)
    end
end
-- Tell whether `filename` is an exact match for `pattern`.
-- Both arguments are split by parse() into base, language and extension.
--   * a language found in the pattern must be the one found in the filename;
--   * a non-empty extension in the pattern must equal the filename's;
--   * the bases must be equal, or the filename's base must end with the
--     pattern's base immediately preceded by a '/'.
-- Returns a boolean.
function M.is_exact(filename, pattern)
    local f_base, f_lang, f_ext = parse(filename)
    local p_base, p_lang, p_ext = parse(pattern)
    -- language and extension only constrain the match when the pattern has them
    local lang_ok = (p_lang == nil) or (f_lang == p_lang)
    local ext_ok = (p_ext == nil) or (p_ext == '') or (f_ext == p_ext)
    if not (lang_ok and ext_ok) then
        return false
    end
    -- identical bases
    if f_base == p_base then
        return true
    end
    -- or the pattern's base is the tail of the filename's base, right
    -- after a directory separator
    local p_len = #p_base
    return f_base:sub(-p_len) == p_base
        and f_base:sub(-p_len - 1, -p_len - 1) == '/'
end
-- Ordering function for docfiles (see texdoclib-search.tlu for structure),
-- suitable as a comparator for table.sort. Criteria, in decreasing priority:
-- 1. higher score first,
-- 2. then lower ext_pos first (extensions ordered as in ext_list),
-- 3. then lexicographically smaller normname first,
-- 4. then higher tree first.
-- Returns true if a is better than b (false when they tie on everything).
local function docfile_order(a, b)
    -- 1. score, descending
    if a.score > b.score then return true end
    if a.score < b.score then return false end
    -- 2. extension position, ascending
    if a.ext_pos < b.ext_pos then return true end
    if a.ext_pos > b.ext_pos then return false end
    -- 3. normalized name, ascending
    if a.normname < b.normname then return true end
    if a.normname > b.normname then return false end
    -- 4. tree, descending
    return a.tree > b.tree
end
-----------------------------   public functions   -----------------------------
-- Return the index in the 'ext_list' config value of the most specific
-- extension matching `filename`, or ext_list_max + 1 if none matches.
-- Entries of ext_list are interpreted as follows:
--   '*'  matches anything, but only while nothing else has matched;
--   ''   matches a name that contains no '.';
--   ext  matches a name ending in '.' .. ext; it replaces the current choice
--        when that choice is unset, is '*', or is a shorter extension.
function M.ext_pos(filename)
    -- remove zipext if applicable
    local stem = texdoc.util.parse_zip(filename)
    -- now find the extension
    local best_pos, best_ext
    for idx, cand in ipairs(texdoc.config.get_value('ext_list')) do
        if cand == '*' and best_ext == nil then
            best_pos, best_ext = idx, cand
        elseif cand == '' and not stem:find('.', 1, true) then
            best_pos, best_ext = idx, cand
        elseif stem:sub(-#cand - 1) == '.' .. cand
            and (best_ext == nil or best_ext == '*' or #cand > #best_ext) then
            best_pos, best_ext = idx, cand
        end
    end
    return best_pos or (texdoc.config.get_value('ext_list_max') + 1)
end
-- Classify a docfile by its score:
--   'good'    score > 0
--   'bad'     -100 < score <= 0
--   'killed'  anything else
function M.docfile_quality(df)
    local score = df.score
    if score > 0 then
        return 'good'
    end
    if score > -100 then
        return 'bad'
    end
    return 'killed'
end
-- Score and sort a doclist in place.
--   dl:          the doclist; it must provide a stop() method and hold its
--                docfiles as a sequence
--   original_kw: keyword handed to the scoring step (used there to select
--                keyword-specific score adjustments)
-- Nothing is returned; dl itself is reordered, best docfile first.
function M.sort_doclist(dl, original_kw)
    -- NOTE(review): stop() is defined outside this file; presumably it
    -- finalizes the list before it is reordered -- confirm against the
    -- doclist implementation
    dl:stop()
    -- give every entry its score for this keyword
    set_list_scores(dl, original_kw)
    -- order the entries as decided by docfile_order
    table.sort(dl, docfile_order)
end
return M
-- vim: ft=lua: