package org.wikipedia.miner.annotation.preprocessing;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument;
import org.wikipedia.miner.model.Article;
import org.wikipedia.miner.model.Wikipedia;

/* loaded from: input_file:org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.class */
/**
 * Document preprocessor for text written in wiki markup.
 *
 * <p>Markup constructs (templates, tables, internal links, section headers, comments,
 * references, other tags, external links, emphasis quotes, list/indent markers and
 * character entities) are overwritten rather than deleted. The blanking done in this class
 * replaces every removed character with a single space, so character offsets of the
 * remaining text stay aligned with the original input.
 *
 * <p>While blanking, the labels of internal links and the titles of most section headers
 * are collected into a newline-separated "context" string, and the ids of the articles that
 * the links resolve to are collected into a set. Both are handed to the resulting
 * {@link PreprocessedDocument}.
 */
public class WikiPreprocessor extends DocumentPreprocessor {

    /** Matches a section header whose closing run of '=' equals its opening run. */
    private static final Pattern SECTION_HEADER = Pattern.compile("(={2,})([^=]+)\\1");

    /** Matches either delimiter of a template. */
    private static final Pattern TEMPLATE_DELIMITERS = Pattern.compile("(\\{\\{|\\}\\})");

    /** Matches either delimiter of a table. */
    private static final Pattern TABLE_DELIMITERS = Pattern.compile("(\\{\\||\\|\\})");

    /** Matches either delimiter of an internal link. */
    private static final Pattern LINK_DELIMITERS = Pattern.compile("(\\[\\[|\\]\\])");

    /** Section titles (compared case-insensitively) that are not added to the context. */
    private static final String[] IGNORED_SECTION_TITLES = {
        "see also", "external links", "references", "further reading"
    };

    /** Source used to resolve link targets to articles. */
    private final Wikipedia wikipedia;

    /**
     * Creates a preprocessor that resolves link targets against the given Wikipedia.
     *
     * @param wikipedia the instance used to look up articles by title
     */
    public WikiPreprocessor(Wikipedia wikipedia) {
        // The meaning of the three superclass arguments is defined in DocumentPreprocessor
        // (not visible here); only a section-header-like pattern is supplied.
        super(null, null, Pattern.compile("={2,}([^=]+)={2,}"));
        this.wikipedia = wikipedia;
    }

    /**
     * Blanks out wiki markup in the given text and gathers link/section context.
     *
     * @param str the raw wiki markup
     * @return a document built from the original text, the blanked text, the collected
     *         context, the region tags computed on the original text, and the ids of the
     *         articles that internal links resolved to
     */
    @Override // org.wikipedia.miner.annotation.preprocessing.DocumentPreprocessor
    public PreprocessedDocument preprocess(String str) {
        StringBuilder context = new StringBuilder();
        // Region tags are computed on the untouched input, before any blanking.
        ArrayList<PreprocessedDocument.RegionTag> regionTags = getRegionTags(str);
        HashSet<Integer> linkedArticleIds = new HashSet<>();

        // Nesting-aware constructs first; links and headers also feed the context.
        String text = blankTemplates(str);
        text = blankTables(text);
        text = blankLinks(text, context, linkedArticleIds);
        text = blankSectionHeaders(text, context);

        // clearAllMentions is inherited from DocumentPreprocessor; presumably it blanks every
        // match of the regex in the text - TODO confirm against the superclass.

        // Comments.
        text = clearAllMentions("(?s)\\<\\!\\-\\-(.*?)\\-\\-\\>", text);

        // Self-closing references such as <ref/> or <ref name="x" />. These must be cleared
        // before the paired-reference patterns below: otherwise a self-closing tag with
        // attributes is taken as an opening tag and everything up to the next </ref>,
        // including ordinary prose, gets blanked. (The previous pattern here only matched
        // the literal text "<ref\>", which does not occur in wiki markup.)
        text = clearAllMentions("<ref(\\s[^>]*)?/>", text);

        // Paired references, without and with attributes.
        text = clearAllMentions("(?s)<ref>(.*?)</ref>", text);
        text = clearAllMentions("(?s)<ref\\s(.*?)>(.*?)</ref>", text);

        // Any remaining tags.
        text = clearAllMentions("<(.*?)>", text);

        // Bracketed external links.
        text = clearAllMentions("\\[(http|www)(.*?)\\]", text);

        // Runs of two or more apostrophes (emphasis markup).
        text = clearAllMentions("'{2,}", text);

        // Indent and list markers at the start of a line; the newline itself is kept.
        text = clearAllMentionsRetainFirstCharacter("\n:+", text);
        text = clearAllMentionsRetainFirstCharacter("\n([\\*\\#]+)", text);

        // Character entities such as &nbsp;
        text = clearAllMentions("&\\w{2,6};", text);

        return new PreprocessedDocument(str, text, context.toString(), regionTags, linkedArticleIds);
    }

    /**
     * Blanks section headers and appends their titles to the context.
     *
     * @param text    the text to process
     * @param context receives a newline followed by the trimmed title of each header whose
     *                title is not one of {@link #IGNORED_SECTION_TITLES}
     * @return the text with every header overwritten by the padding that the inherited
     *         {@code getSpaceString} produces for the header's length
     */
    private String blankSectionHeaders(String text, StringBuilder context) {
        Matcher matcher = SECTION_HEADER.matcher(text);
        StringBuilder result = new StringBuilder(text.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            result.append(text, copiedUpTo, matcher.start());
            result.append(getSpaceString(matcher.group().length()));
            String title = matcher.group(2).trim();
            if (!isIgnoredSectionTitle(title)) {
                context.append('\n').append(title);
            }
            copiedUpTo = matcher.end();
        }
        result.append(text, copiedUpTo, text.length());
        return result.toString();
    }

    /** Returns whether the title equals, ignoring case, one of the ignored section titles. */
    private static boolean isIgnoredSectionTitle(String title) {
        for (String ignored : IGNORED_SECTION_TITLES) {
            if (title.equalsIgnoreCase(ignored)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Overwrites every outermost template (double curly braces) with spaces.
     *
     * @param text the text to process
     * @return the text with templates blanked
     */
    private String blankTemplates(String text) {
        return blankBalanced(text, TEMPLATE_DELIMITERS, "{{", null,
                "WikiPreprocessor | Warning: templates were not well formed, so we cannot guarantee that they were stripped out correctly. ");
    }

    /**
     * Overwrites every outermost table (curly brace plus pipe delimiters) with spaces.
     *
     * @param text the text to process
     * @return the text with tables blanked
     */
    private String blankTables(String text) {
        return blankBalanced(text, TABLE_DELIMITERS, "{|", null,
                "WikiPreprocessor | Warning: tables were not well formed, so we cannot guarantee that they were stripped out correctly. ");
    }

    /**
     * Overwrites every outermost internal link (double square brackets) with spaces and
     * records the link's label and target article.
     *
     * @param text             the text to process
     * @param context          receives the label of each processed link
     * @param linkedArticleIds receives the id of each article a link target resolves to
     * @return the text with internal links blanked
     */
    private String blankLinks(String text, StringBuilder context, HashSet<Integer> linkedArticleIds) {
        ArrayList<String> linkBodies = new ArrayList<>();
        String blanked = blankBalanced(text, LINK_DELIMITERS, "[[", linkBodies,
                "WikiPreprocessor| Warning: links were not well formed, so we cannot guarantee that they were stripped out correctly. ");
        for (String linkBody : linkBodies) {
            processLink(linkBody, context, linkedArticleIds);
        }
        return blanked;
    }

    /**
     * Overwrites each outermost balanced opener/closer span with spaces.
     *
     * <p>Nested spans are blanked as part of the outermost span that contains them. A closer
     * that appears while no opener is pending is ignored. If openers are still pending at
     * the end of the text, the warning is printed to standard error and the text from the
     * pending opener onwards is left untouched.
     *
     * @param text       the text to process
     * @param delimiters pattern matching either the opener or the closer
     * @param opener     the exact opener text; any other match is treated as a closer
     * @param enclosed   if non-null, receives the text between the delimiters of each
     *                   outermost span, in order of appearance
     * @param warning    message printed when openers remain unclosed
     * @return the text with all outermost spans replaced by the same number of spaces
     */
    private static String blankBalanced(String text, Pattern delimiters, String opener,
            ArrayList<String> enclosed, String warning) {
        Matcher matcher = delimiters.matcher(text);
        StringBuilder result = new StringBuilder(text.length());
        int copiedUpTo = 0;
        // Only the outermost opener's offset is ever needed, so a depth counter plus that
        // single offset is equivalent to keeping a stack of every pending opener.
        int depth = 0;
        int outermostStart = -1;
        while (matcher.find()) {
            if (matcher.group().equals(opener)) {
                if (depth == 0) {
                    outermostStart = matcher.start();
                }
                depth++;
            } else if (depth > 0) {
                depth--;
                if (depth == 0) {
                    result.append(text, copiedUpTo, outermostStart);
                    appendSpaces(result, matcher.end() - outermostStart);
                    if (enclosed != null) {
                        enclosed.add(text.substring(outermostStart + opener.length(), matcher.start()));
                    }
                    copiedUpTo = matcher.end();
                }
            }
        }
        if (depth > 0) {
            System.err.println(warning);
        }
        result.append(text, copiedUpTo, text.length());
        return result.toString();
    }

    /** Appends {@code count} spaces to the builder; does nothing when count is not positive. */
    private static void appendSpaces(StringBuilder builder, int count) {
        for (int i = 0; i < count; i++) {
            builder.append(' ');
        }
    }

    /**
     * Overwrites every match of the regex with spaces, except for the match's first
     * character, which is kept as is.
     *
     * @param regex the expression to search for; compiled case-insensitively and with
     *              dot-matches-all
     * @param text  the text to process
     * @return the text with all but the first character of each match blanked
     */
    private String clearAllMentionsRetainFirstCharacter(String regex, String text) {
        Matcher matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.DOTALL).matcher(text);
        StringBuilder result = new StringBuilder(text.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            result.append(text, copiedUpTo, matcher.start());
            result.append(text.charAt(matcher.start()));
            appendSpaces(result, matcher.group().length() - 1);
            copiedUpTo = matcher.end();
        }
        result.append(text, copiedUpTo, text.length());
        return result.toString();
    }

    /**
     * Records the label and target article of one internal link.
     *
     * <p>Links containing a colon anywhere after the first character are skipped entirely.
     * Otherwise the text after the last pipe is the label and the text before it is the
     * target; a link without a pipe (or with its only pipe as the first character) uses the
     * whole body for both.
     *
     * @param link             the text between the link delimiters
     * @param context          receives a newline followed by the link label
     * @param linkedArticleIds receives the article id if the target resolves to an article
     */
    private void processLink(String link, StringBuilder context, HashSet<Integer> linkedArticleIds) {
        if (link.indexOf(':') > 0) {
            return;
        }
        String label = link;
        String target = link;
        int pipeIndex = link.lastIndexOf('|');
        if (pipeIndex > 0) {
            label = link.substring(pipeIndex + 1);
            target = link.substring(0, pipeIndex);
        }
        context.append('\n').append(label);
        Article article = this.wikipedia.getArticleByTitle(target);
        if (article != null) {
            linkedArticleIds.add(Integer.valueOf(article.getId()));
        }
    }
}
