import parse from "node-html-parser";



/**
 * Structural description of a single HTML element: which tag it is and
 * which attributes it carries.
 */
export interface HtmlFeatures {
    /** Tag name of the element; this file stores it lower-cased. */
    tag: string;
    /** Value of the element's `class` attribute; absent when the element has none. */
    class?: string;
    /** Value of the element's `style` attribute; absent when the element has none. */
    style?: string;
    /**
     * Attributes other than `style` and `class`, keyed by attribute name.
     * The extractor in this file stores the placeholder `"*"` as every value,
     * so only the presence of an attribute is recorded, not its content.
     */
    otherAttributes: Record<string, string>;
}



/**
 * Collects the distinct HTML features that occur across a list of pages.
 *
 * Each page is handed to `convertPageToHtmlFeatures`, which yields features
 * as JSON strings. The strings from all pages are merged into one `Set`, so
 * identical serializations are kept only once, and each surviving string is
 * then parsed back into an object.
 *
 * @param pages - HTML source of each page to inspect.
 * @returns One `HtmlFeatures` object per distinct serialized feature, in
 *          first-seen order. The objects are fresh `JSON.parse` results, so
 *          the returned `Set` holds them by reference, not by value.
 */
export function getHtmlFeatures(pages: string[]): Set<HtmlFeatures> {
    // De-duplicate on the serialized form: a Set of objects would compare
    // by identity and never merge two equal features.
    const serialized = new Set<string>();
    pages.forEach(page => {
        convertPageToHtmlFeatures(page).forEach(key => {
            serialized.add(key);
        });
    });

    const result = new Set<HtmlFeatures>();
    serialized.forEach(key => {
        result.add(JSON.parse(key));
    });
    return result;
}

/**
 * Extracts the distinct element features found inside the `<body>` of one
 * HTML page.
 *
 * Every element matched by `*` under `<body>` is reduced to an
 * `HtmlFeatures` record: its lower-cased tag name, the values of its `style`
 * and `class` attributes (when present), and the names of all remaining
 * attributes, each mapped to the placeholder `"*"`. Records are serialized
 * with `JSON.stringify` so the returned `Set` de-duplicates them by value.
 *
 * The serialized form is canonical: properties are always written in the
 * order `tag`, `otherAttributes`, `style`, `class`, and `otherAttributes`
 * keys are inserted in sorted order. Elements that differ only in the order
 * their attributes were written therefore collapse into a single entry.
 *
 * @param page - HTML source of a single page.
 * @returns JSON-serialized features; empty when the page has no `<body>`.
 */
function convertPageToHtmlFeatures(page: string): Set<string> {
    const features = new Set<string>();
    const body = parse(page).querySelector("body");
    if (!body) return features;

    for (const node of body.querySelectorAll("*")) {
        let style: string | undefined;
        let className: string | undefined;
        const otherNames: string[] = [];

        for (const [name, val] of Object.entries(node.attributes)) {
            switch (name) {
                case "style":
                    style = val;
                    break;
                case "class":
                    className = val;
                    break;
                default:
                    otherNames.push(name);
            }
        }

        // Sort so the serialized key does not depend on the order in which
        // the attributes happened to be written in the markup.
        otherNames.sort();

        const feature: HtmlFeatures = {
            tag: node.tagName.toLowerCase(),
            otherAttributes: {}
        };
        for (const name of otherNames) {
            feature.otherAttributes[name] = "*";
        }
        // Assign the optional fields last and in a fixed order; they are
        // left off entirely when the attribute is absent, so they do not
        // appear in the JSON at all.
        if (style !== undefined) feature.style = style;
        if (className !== undefined) feature.class = className;

        features.add(JSON.stringify(feature));
    }
    return features;
}