package org.wikipedia.miner.annotation;

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.wikipedia.miner.annotation.ArticleCleaner;
import org.wikipedia.miner.comparison.ArticleComparer;
import org.wikipedia.miner.db.WDatabase;
import org.wikipedia.miner.model.Article;
import org.wikipedia.miner.model.Label;
import org.wikipedia.miner.model.Wikipedia;
import org.wikipedia.miner.util.ArticleSet;
import org.wikipedia.miner.util.NGrammer;
import org.wikipedia.miner.util.Position;
import org.wikipedia.miner.util.ProgressTracker;
import org.wikipedia.miner.util.RelatednessCache;
import org.wikipedia.miner.util.Result;
import org.wikipedia.miner.util.WikipediaConfiguration;
import org.wikipedia.miner.util.text.TextProcessor;
import weka.classifiers.Classifier;
import weka.classifiers.meta.Bagging;
import weka.core.Instance;
import weka.core.Utils;
import weka.core.WekaException;
import weka.wrapper.Dataset;
import weka.wrapper.Decider;
import weka.wrapper.DeciderBuilder;

/* loaded from: input_file:org/wikipedia/miner/annotation/Disambiguator.class */
public class Disambiguator {
    private Wikipedia wikipedia;
    private ArticleCleaner cleaner;
    private TextProcessor tp;
    private ArticleComparer comparer;
    private NGrammer nGrammer;
    private double minSenseProbability;
    private double minLinkProbability;
    private int maxContextSize;
    private Decider<Attributes, Boolean> decider;
    private Dataset<Attributes, Boolean> dataset;
    private int maxLabelLength = 20;
    private int sensesConsidered = 0;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/wikipedia/miner/annotation/Disambiguator$Attributes.class */
    public enum Attributes {
        commonness,
        relatedness,
        contextQuality
    }

    public Disambiguator(Wikipedia wikipedia) throws IOException, Exception {
        WikipediaConfiguration config = wikipedia.getConfig();
        init(wikipedia, new ArticleComparer(wikipedia), config.getDefaultTextProcessor(), config.getMinSenseProbability(), config.getMinLinkProbability(), 20);
        if (config.getTopicDisambiguationModel() != null) {
            loadClassifier(config.getTopicDisambiguationModel());
        }
    }

    public Disambiguator(Wikipedia wikipedia, ArticleComparer articleComparer, TextProcessor textProcessor, double d, double d2, int i) throws Exception {
        init(wikipedia, articleComparer, textProcessor, d, d2, i);
    }

    private void weightTrainingInstances() {
        double d = 0.0d;
        double d2 = 0.0d;
        Enumeration enumerateInstances = this.dataset.enumerateInstances();
        while (enumerateInstances.hasMoreElements()) {
            if (((Instance) enumerateInstances.nextElement()).value(3) == 0.0d) {
                d += 1.0d;
            } else {
                d2 += 1.0d;
            }
        }
        double d3 = d / (d + d2);
        Enumeration enumerateInstances2 = this.dataset.enumerateInstances();
        while (enumerateInstances2.hasMoreElements()) {
            Instance instance = (Instance) enumerateInstances2.nextElement();
            if (instance.value(3) == 0.0d) {
                instance.setWeight(0.5d * (1.0d / d3));
            } else {
                instance.setWeight(0.5d * (1.0d / (1.0d - d3)));
            }
        }
    }

    private void init(Wikipedia wikipedia, ArticleComparer articleComparer, TextProcessor textProcessor, double d, double d2, int i) throws Exception {
        this.wikipedia = wikipedia;
        this.comparer = articleComparer;
        this.cleaner = new ArticleCleaner();
        this.tp = textProcessor;
        this.nGrammer = new NGrammer(wikipedia.getConfig().getSentenceDetector(), wikipedia.getConfig().getTokenizer());
        this.nGrammer.setMaxN(this.maxLabelLength);
        this.minSenseProbability = d;
        this.minLinkProbability = d2;
        this.maxContextSize = i;
        this.decider = new DeciderBuilder("LinkDisambiguator", Attributes.class).setDefaultAttributeTypeNumeric().setClassAttributeTypeBoolean("isCorrectSense").build();
        if (wikipedia.getConfig().getCachePriority(WDatabase.DatabaseType.label) == null) {
            Logger.getLogger(Disambiguator.class).warn("'label' database has not been cached, so this will run significantly slower than it needs to.");
        }
        if (wikipedia.getConfig().getCachePriority(WDatabase.DatabaseType.pageLinksIn) == null) {
            Logger.getLogger(Disambiguator.class).warn("'pageLinksIn' database has not been cached, so this will run significantly slower than it needs to.");
        }
    }

    public double getProbabilityOfSense(double d, double d2, Context context) throws Exception {
        Instance build = this.decider.getInstanceBuilder().setAttribute(Attributes.commonness, Double.valueOf(d)).setAttribute(Attributes.relatedness, Double.valueOf(d2)).setAttribute(Attributes.contextQuality, Double.valueOf(context.getQuality())).build();
        this.sensesConsidered++;
        return ((Double) this.decider.getDecisionDistribution(build).get(true)).doubleValue();
    }

    public void train(ArticleSet articleSet, ArticleCleaner.SnippetLength snippetLength, String str, RelatednessCache relatednessCache) throws Exception {
        this.dataset = this.decider.createNewDataset();
        ProgressTracker progressTracker = new ProgressTracker(articleSet.size(), "training", Disambiguator.class);
        Iterator<Article> it = articleSet.iterator();
        while (it.hasNext()) {
            train(it.next(), snippetLength, relatednessCache);
            progressTracker.update();
        }
        weightTrainingInstances();
    }

    public void saveTrainingData(File file) throws IOException, Exception {
        Logger.getLogger(Disambiguator.class).info("saving training data");
        this.dataset.save(file);
    }

    public void loadTrainingData(File file) throws Exception {
        Logger.getLogger(Disambiguator.class).info("loading training data");
        this.dataset = this.decider.createNewDataset();
        this.dataset.load(file);
        weightTrainingInstances();
    }

    public void clearTrainingData() {
        this.dataset = null;
    }

    public void saveClassifier(File file) throws IOException, Exception {
        Logger.getLogger(Disambiguator.class).info("saving classifier");
        this.decider.save(file);
    }

    public void loadClassifier(File file) throws IOException, Exception {
        Logger.getLogger(Disambiguator.class).info("loading classifier");
        this.decider.load(file);
    }

    public void buildClassifier(Classifier classifier) throws Exception {
        Logger.getLogger(Disambiguator.class).info("building classifier");
        this.decider.train(classifier, this.dataset);
    }

    public void buildDefaultClassifier() throws Exception {
        Bagging bagging = new Bagging();
        bagging.setOptions(Utils.splitOptions("-P 10 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -U -M 2"));
        this.decider.train(bagging, this.dataset);
    }

    public ArticleComparer getArticleComparer() {
        return this.comparer;
    }

    private void train(Article article, ArticleCleaner.SnippetLength snippetLength, RelatednessCache relatednessCache) throws Exception {
        Vector vector = new Vector();
        Vector vector2 = new Vector();
        String markupLinksOnly = this.cleaner.getMarkupLinksOnly(article, snippetLength);
        Matcher matcher = Pattern.compile("\\[\\[(.*?)\\]\\]").matcher(markupLinksOnly);
        while (matcher.find()) {
            String substring = markupLinksOnly.substring(matcher.start() + 2, matcher.end() - 2);
            String str = substring;
            String str2 = substring;
            int lastIndexOf = substring.lastIndexOf(124);
            if (lastIndexOf > 0) {
                str2 = substring.substring(0, lastIndexOf);
                str = substring.substring(lastIndexOf + 1);
            }
            Label label = new Label(this.wikipedia.getEnvironment(), str, this.tp);
            Label.Sense[] senses = label.getSenses();
            Article articleByTitle = this.wikipedia.getArticleByTitle(str2);
            if (articleByTitle != null && senses.length >= 1) {
                TopicReference topicReference = new TopicReference(label, articleByTitle.getId(), new Position(0, 0));
                if (senses.length == 1 || senses[0].getPriorProbability() >= 1.0d - this.minSenseProbability) {
                    vector.add(label);
                } else {
                    vector2.add(topicReference);
                }
            }
        }
        Context context = getContext(article, snippetLength, relatednessCache);
        Iterator it = vector2.iterator();
        while (it.hasNext()) {
            TopicReference topicReference2 = (TopicReference) it.next();
            for (Label.Sense sense : topicReference2.getLabel().getSenses()) {
                if (sense.getPriorProbability() < this.minSenseProbability) {
                    break;
                }
                this.dataset.add(this.decider.getInstanceBuilder().setAttribute(Attributes.commonness, Double.valueOf(sense.getPriorProbability())).setAttribute(Attributes.relatedness, Double.valueOf(context.getRelatednessTo(sense))).setAttribute(Attributes.contextQuality, Double.valueOf(context.getQuality())).setClassAttribute(Boolean.valueOf(sense.getId() == topicReference2.getTopicId().intValue())).build());
            }
        }
    }

    public Result<Integer> test(ArticleSet articleSet, Wikipedia wikipedia, ArticleCleaner.SnippetLength snippetLength, RelatednessCache relatednessCache) throws SQLException, Exception {
        if (wikipedia == null) {
            Wikipedia wikipedia2 = this.wikipedia;
        }
        if (!this.decider.isReady()) {
            throw new WekaException("You must build (or load) classifier first.");
        }
        Result<Integer> result = new Result<>();
        double d = 1.0d;
        double d2 = 1.0d;
        int i = 0;
        int i2 = 0;
        int i3 = 0;
        ProgressTracker progressTracker = new ProgressTracker(articleSet.size(), "Testing", Disambiguator.class);
        Iterator<Article> it = articleSet.iterator();
        while (it.hasNext()) {
            i++;
            Result<Integer> test = test(it.next(), snippetLength, relatednessCache);
            if (test.getRecall() == 1.0d) {
                i2++;
            }
            if (test.getPrecision() == 1.0d) {
                i3++;
            }
            d = Math.min(d, test.getRecall());
            d2 = Math.min(d2, test.getPrecision());
            result.addIntermediateResult(test);
            progressTracker.update();
        }
        System.out.println("worstR:" + d + ", worstP:" + d2);
        System.out.println("tested:" + i + ", perfectR:" + i2 + ", perfectP:" + i3);
        return result;
    }

    private Result<Integer> test(Article article, ArticleCleaner.SnippetLength snippetLength, RelatednessCache relatednessCache) throws Exception {
        System.out.println(" - testing " + article);
        Vector vector = new Vector();
        Vector vector2 = new Vector();
        String markupLinksOnly = this.cleaner.getMarkupLinksOnly(article, snippetLength);
        Matcher matcher = Pattern.compile("\\[\\[(.*?)\\]\\]").matcher(markupLinksOnly);
        HashSet hashSet = new HashSet();
        HashSet hashSet2 = new HashSet();
        while (matcher.find()) {
            String substring = markupLinksOnly.substring(matcher.start() + 2, matcher.end() - 2);
            String str = substring;
            String str2 = substring;
            int lastIndexOf = substring.lastIndexOf(124);
            if (lastIndexOf > 0) {
                str2 = substring.substring(0, lastIndexOf);
                str = substring.substring(lastIndexOf + 1);
            }
            String str3 = Character.toUpperCase(str2.charAt(0)) + str2.substring(1);
            Label label = new Label(this.wikipedia.getEnvironment(), str, this.tp);
            Label.Sense[] senses = label.getSenses();
            Article articleByTitle = this.wikipedia.getArticleByTitle(str3);
            if (senses.length > 0 && articleByTitle != null) {
                hashSet.add(Integer.valueOf(articleByTitle.getId()));
                if (senses.length == 1 || senses[0].getPriorProbability() >= 1.0d - this.minSenseProbability) {
                    vector.add(label);
                    hashSet2.add(Integer.valueOf(articleByTitle.getId()));
                } else {
                    vector2.add(new TopicReference(label, articleByTitle.getId(), null));
                }
            }
        }
        Context context = getContext(article, snippetLength, relatednessCache);
        Iterator it = vector2.iterator();
        while (it.hasNext()) {
            TopicReference topicReference = (TopicReference) it.next();
            TreeSet treeSet = new TreeSet();
            for (Label.Sense sense : topicReference.getLabel().getSenses()) {
                if (sense.getPriorProbability() < this.minSenseProbability) {
                    break;
                }
                double probabilityOfSense = getProbabilityOfSense(sense.getPriorProbability(), context.getRelatednessTo(sense), context);
                if (probabilityOfSense > 0.5d) {
                    Article article2 = new Article(this.wikipedia.getEnvironment(), sense.getId());
                    article2.setWeight(Double.valueOf(probabilityOfSense));
                    treeSet.add(article2);
                }
                this.sensesConsidered++;
            }
            if (!treeSet.isEmpty()) {
                hashSet2.add(Integer.valueOf(((Article) treeSet.first()).getId()));
            }
        }
        Result<Integer> result = new Result<>(hashSet2, hashSet);
        System.out.println("   " + result);
        return result;
    }

    private Context getContext(Article article, ArticleCleaner.SnippetLength snippetLength, RelatednessCache relatednessCache) throws Exception {
        String markupLinksOnly = this.cleaner.getMarkupLinksOnly(article, snippetLength);
        HashSet hashSet = new HashSet();
        Vector vector = new Vector();
        for (NGrammer.NGramSpan nGramSpan : this.nGrammer.ngramPosDetect(markupLinksOnly)) {
            Label label = this.wikipedia.getLabel(nGramSpan, markupLinksOnly);
            if (label.exists() && label.getLinkProbability() >= this.minLinkProbability && label.getLinkDocCount() >= this.wikipedia.getConfig().getMinLinksIn() && !hashSet.contains(label.getText())) {
                vector.add(label);
            }
        }
        return relatednessCache == null ? new Context(vector, new RelatednessCache(this.comparer), this.maxContextSize, getMinSenseProbability()) : new Context(vector, relatednessCache, this.maxContextSize, getMinSenseProbability());
    }

    public int getMaxLabelLength() {
        return this.maxLabelLength;
    }

    public double getMinLinkProbability() {
        return this.minLinkProbability;
    }

    public void setMinLinkProbability(double d) {
        this.minLinkProbability = d;
    }

    public double getMinSenseProbability() {
        return this.minSenseProbability;
    }

    public void setMinSenseProbability(double d) {
        this.minSenseProbability = d;
    }

    public int getMaxContextSize() {
        return this.maxContextSize;
    }

    public TextProcessor getTextProcessor() {
        return this.tp;
    }

    public int getSensesConsidered() {
        return this.sensesConsidered;
    }
}
