package org.wikipedia.miner.annotation.preprocessing;

import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument;

/* loaded from: input_file:org/wikipedia/miner/annotation/preprocessing/HtmlPreprocessor.class */
/**
 * Preprocessor for HTML documents.
 *
 * <p>It collects context text (page title, keyword/description meta tags and
 * link anchor text) and produces a version of the markup in which the header,
 * links, comments, scripts, tags and entities are handed to the inherited
 * blanking helpers, so that the remaining text can be processed on its own.
 *
 * <p>NOTE(review): {@code getRegionTags}, {@code clearAllMentions} and
 * {@code getSpaceString} are inherited from {@link DocumentPreprocessor} and
 * are not visible here; they are presumed to locate region tags, blank out
 * regex matches, and build a run of spaces respectively - confirm against the
 * superclass.
 */
public class HtmlPreprocessor extends DocumentPreprocessor {
    /** Tags that delimit regions when the no-argument constructor is used. */
    static String[] defaultRegionTags = {"div", "table"};
    /** Tags that act as splitters when the no-argument constructor is used. */
    static String[] defaultSplitterTags = {"h1", "h2"};

    /** Finds the opening of the body element regardless of letter case. */
    private static final Pattern BODY_START = Pattern.compile("<body", Pattern.CASE_INSENSITIVE);

    /** Title element; applied to the lower-cased header, group 2 is the title text. */
    private static final Pattern TITLE = Pattern.compile("<title([^>]*)>(.*?)</title>", Pattern.DOTALL);

    /**
     * A single meta tag, self-closing or not. Stopping at the first {@code >}
     * keeps one match from running across several tags.
     */
    private static final Pattern META_TAG = Pattern.compile("<meta(?![\\w-])[^>]*>");

    /** Extracts the {@code name} attribute of a (lower-cased) meta tag. */
    private static final Pattern META_NAME = attributePattern("name");

    /** Extracts the {@code content} attribute of a (lower-cased) meta tag. */
    private static final Pattern META_CONTENT = attributePattern("content");

    /**
     * A hyperlink element; group 2 is everything between the start and end tag.
     * The start tag must be exactly {@code a} (optionally followed by
     * attributes) so that elements such as {@code <abbr>} or {@code <article>}
     * are not mistaken for links.
     */
    private static final Pattern ANCHOR =
            Pattern.compile("<a(\\s[^>]*)?>(.*?)</a>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Creates a preprocessor using the default region and splitter tags. */
    public HtmlPreprocessor() {
        super(getStartTagRegex(defaultRegionTags), getEndTagRegex(defaultRegionTags), getTagRegex(defaultSplitterTags));
    }

    /**
     * Creates a preprocessor with custom tags.
     *
     * @param strArr  names of the tags that delimit regions; may be null or empty
     * @param strArr2 names of the tags that act as splitters; may be null or empty
     */
    public HtmlPreprocessor(String[] strArr, String[] strArr2) {
        super(getStartTagRegex(strArr), getEndTagRegex(strArr), getTagRegex(strArr2));
    }

    /**
     * Preprocesses an HTML document.
     *
     * <p>Context text is gathered from the header (title, keyword and
     * description meta tags - these are lower-cased, as before) and from the
     * text of every link in the body (original case). The links themselves are
     * replaced by the result of {@code getSpaceString} for their length, and
     * the header is replaced by {@code getSpaceString(bodyStart)}.
     *
     * @param str the raw HTML; must not be null
     * @return the preprocessed document built from the original text, the
     *         cleaned text, the collected context and the region tags
     */
    @Override
    public PreprocessedDocument preprocess(String str) {
        // Locale.ROOT keeps the result independent of the JVM's default locale
        // (e.g. a Turkish locale would turn "TITLE" into "tıtle").
        String lowerCase = str.toLowerCase(Locale.ROOT);
        // NOTE(review): region tags are still located in the lower-cased copy, as
        // in the original code; whether the superclass needs lower-cased input is
        // not visible from here.
        ArrayList<PreprocessedDocument.RegionTag> regionTags = getRegionTags(lowerCase);

        // Locate the body in the ORIGINAL text: lower-casing can change the
        // string length, so an index taken from the lower-cased copy is not
        // guaranteed to be valid for the original.
        Matcher bodyMatcher = BODY_START.matcher(str);
        int bodyStart = bodyMatcher.find() ? bodyMatcher.start() : 0;

        StringBuilder context = new StringBuilder();
        String header = str.substring(0, bodyStart).toLowerCase(Locale.ROOT);
        appendTitles(header, context);
        appendMetaDescriptions(header, context);

        String body = str.substring(bodyStart);
        Matcher anchor = ANCHOR.matcher(body);
        StringBuilder withoutLinks = new StringBuilder(body.length());
        int copiedUpTo = 0;
        while (anchor.find()) {
            withoutLinks.append(body, copiedUpTo, anchor.start());
            // Replace the whole link by a same-length filler so later offsets line up.
            withoutLinks.append(getSpaceString(anchor.end() - anchor.start()));
            String linkText = clearAllMentions("<(.*?)>", anchor.group(2)).trim();
            if (linkText.length() > 0) {
                context.append(linkText).append(".\n");
            }
            copiedUpTo = anchor.end();
        }
        withoutLinks.append(body, copiedUpTo, body.length());

        // Order matters: comments and scripts must go before the generic tag
        // pattern, otherwise only their delimiters would be removed.
        String text = withoutLinks.toString();
        text = clearAllMentions("<!--(.*?)-->", text);
        text = clearAllMentions("(?i)<script(.*?)</script>", text);
        text = clearAllMentions("<(.*?)>", text);
        text = clearAllMentions("&\\w{2,6};", text);

        return new PreprocessedDocument(str, getSpaceString(bodyStart) + text, context.toString(), regionTags, null);
    }

    /** Appends the text of every title element in {@code header}, each followed by ".\n". */
    private static void appendTitles(String header, StringBuilder context) {
        Matcher title = TITLE.matcher(header);
        while (title.find()) {
            context.append(title.group(2)).append(".\n");
        }
    }

    /** Appends the non-empty content of every keywords/description meta tag in {@code header}. */
    private static void appendMetaDescriptions(String header, StringBuilder context) {
        Matcher meta = META_TAG.matcher(header);
        while (meta.find()) {
            String tag = meta.group();
            String name = getAttributeValue(tag, META_NAME);
            if (!name.equals("keywords") && !name.equals("description")) {
                continue;
            }
            String content = getAttributeValue(tag, META_CONTENT);
            if (content.length() > 0) {
                context.append(content).append("\n");
            }
        }
    }

    /**
     * Builds a pattern matching {@code name = "value"} or {@code name = 'value'}.
     * The look-behind stops the name from matching as the tail of a longer
     * attribute name (e.g. {@code classname} or {@code data-name}); only
     * whitespace is allowed around {@code =} so that quotes are never skipped.
     * Group 2 of the resulting pattern is the attribute value.
     */
    private static Pattern attributePattern(String attributeName) {
        return Pattern.compile(
                "(?<![\\w-])" + Pattern.quote(attributeName) + "\\s*=\\s*([\"'])(.*?)\\1",
                Pattern.DOTALL);
    }

    /** Returns the attribute value found by {@code attribute} in {@code tag}, or "" if absent. */
    private static String getAttributeValue(String tag, Pattern attribute) {
        Matcher matcher = attribute.matcher(tag);
        return matcher.find() ? matcher.group(2) : "";
    }

    /**
     * Builds a case-insensitive pattern matching the start tag of any of the given elements.
     *
     * @param strArr tag names (inserted into the regex unquoted)
     * @return the pattern, or null if {@code strArr} is null or empty
     */
    public static Pattern getStartTagRegex(String[] strArr) {
        return buildTagPattern("<", strArr);
    }

    /**
     * Builds a case-insensitive pattern matching the end tag of any of the given elements.
     *
     * @param strArr tag names (inserted into the regex unquoted)
     * @return the pattern, or null if {@code strArr} is null or empty
     */
    public static Pattern getEndTagRegex(String[] strArr) {
        return buildTagPattern("</", strArr);
    }

    /**
     * Builds a case-insensitive pattern matching either the start or the end
     * tag of any of the given elements.
     *
     * @param strArr tag names (inserted into the regex unquoted)
     * @return the pattern, or null if {@code strArr} is null or empty
     */
    public static Pattern getTagRegex(String[] strArr) {
        return buildTagPattern("</?", strArr);
    }

    /**
     * Shared implementation of the three tag-pattern factories.
     *
     * <p>A single tag yields a pattern without capturing groups; several tags
     * yield one capturing group holding the alternation - the same group
     * layout the factories have always produced. The negative look-ahead after
     * the name prevents a tag such as {@code p} from also matching
     * {@code <pre>} or {@code <param>}.
     *
     * @param opening regex for the characters that open the tag ("<", "</" or "</?")
     * @param tags    tag names; used as-is, so regex metacharacters keep their meaning
     * @return the compiled pattern, or null if {@code tags} is null or empty
     */
    private static Pattern buildTagPattern(String opening, String[] tags) {
        if (tags == null || tags.length == 0) {
            return null;
        }
        String names;
        if (tags.length == 1) {
            names = tags[0];
        } else {
            StringBuilder alternation = new StringBuilder("(");
            for (int i = 0; i < tags.length; i++) {
                if (i > 0) {
                    alternation.append('|');
                }
                alternation.append(tags[i]);
            }
            names = alternation.append(')').toString();
        }
        return Pattern.compile(opening + names + "(?![\\w-])[^>]*>", Pattern.CASE_INSENSITIVE);
    }
}
