package org.wikipedia.miner.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.WEnvironment;
import org.wikipedia.miner.model.Article;
import org.wikipedia.miner.model.Page;
import org.wikipedia.miner.model.Wikipedia;

/* loaded from: input_file:org/wikipedia/miner/util/ArticleSet.class */
/**
 * A list of Wikipedia {@link Article}s that can be loaded from / saved to a file of
 * article ids, sampled at random, or assembled by randomly drawing articles that
 * satisfy a set of optional constraints.
 *
 * <p>{@link #contains(Object)} is implemented with a binary search and is therefore
 * only reliable while the list is in its natural sorted order. Sets produced by the
 * constraint-based constructor and by {@link #getRandomSubset(int)} are sorted before
 * they are returned; sets loaded from a file keep the order of the file.
 */
public class ArticleSet extends ArrayList<Article> {
    private static final long serialVersionUID = 6142971965290887331L;

    /** Shared logger, looked up once rather than on every debug statement. */
    private static final Logger LOGGER = Logger.getLogger(ArticleSet.class);

    /** Used to turn article markup into plain text before counting lines and words. */
    private MarkupStripper stripper = new MarkupStripper();

    /** Creates an empty article set. */
    public ArticleSet() {
    }

    /**
     * Loads an article set from a text file containing one article per line, where the
     * article id is everything before the first tab (surrounding whitespace is ignored).
     * Lines whose id field is blank are skipped. Articles are added in file order.
     *
     * @param file the file to read ids from
     * @param wikipedia used to resolve each id via {@code getPageById}
     * @throws IOException if the file cannot be read
     * @throws NumberFormatException if a non-blank id field is not an integer
     */
    public ArticleSet(File file, Wikipedia wikipedia) throws IOException {
        // try-with-resources guarantees the reader is closed even when parsing or lookup fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int tabIndex = line.indexOf('\t');
                String idField = (tabIndex < 0 ? line : line.substring(0, tabIndex)).trim();
                if (idField.isEmpty()) {
                    continue;
                }
                add((Article) wikipedia.getPageById(Integer.parseInt(idField)));
            }
        }
    }

    /**
     * Builds a sorted article set by randomly drawing candidates and keeping those that
     * satisfy the given constraints, until {@code i} articles are gathered or the
     * candidates run out. Any constraint may be {@code null} to disable it.
     *
     * <p>Note that {@code vector}, when supplied, is consumed: every candidate that is
     * examined is removed from it. {@code num} and {@code num2} are only applied when
     * the candidates are gathered here (i.e. when {@code vector} is {@code null}).
     *
     * @param wikipedia the Wikipedia instance to gather candidates from
     * @param i the number of articles wanted
     * @param num minimum number of incoming links (used when gathering candidates)
     * @param num2 minimum number of outgoing links (used when gathering candidates)
     * @param d minimum proportion of outgoing links to words
     * @param d2 maximum proportion of outgoing links to words
     * @param num3 minimum word count
     * @param num4 maximum word count
     * @param d3 maximum proportion of list-item lines
     * @param pattern pattern that the article markup must match
     * @param pattern2 pattern that the article markup must not match
     * @param vector pre-gathered candidates, or {@code null} to gather them here
     * @param articleSet articles to exclude, or {@code null}
     */
    public ArticleSet(Wikipedia wikipedia, int i, Integer num, Integer num2, Double d, Double d2, Integer num3, Integer num4, Double d3, Pattern pattern, Pattern pattern2, Vector<Article> vector, ArticleSet articleSet) {
        buildFromCandidates(wikipedia, vector == null ? getRoughCandidates(wikipedia, num, num2) : vector, i, num, num2, d, d2, num3, num4, d3, pattern, pattern2, articleSet);
    }

    /**
     * Returns a sorted random subset of this set in which every article id occurs at
     * most once.
     *
     * <p>NOTE(review): articles are drawn by rejection sampling on their ids, so if this
     * set holds fewer than {@code i} distinct ids the loop cannot terminate.
     *
     * @param i the number of articles wanted
     * @return a new, sorted set of {@code i} articles
     * @throws IllegalArgumentException if {@code i} exceeds the size of this set
     */
    public ArticleSet getRandomSubset(int i) {
        if (i > size()) {
            throw new IllegalArgumentException("requested size " + i + " is larger than " + size());
        }
        Random random = new Random();
        HashSet<Integer> chosenIds = new HashSet<>();
        ArticleSet subset = new ArticleSet();
        while (subset.size() < i) {
            Article candidate = get(random.nextInt(size()));
            // add() reports whether the id was new, so no separate contains() lookup is needed.
            if (chosenIds.add(Integer.valueOf(candidate.getId()))) {
                subset.add(candidate);
            }
        }
        Collections.sort(subset);
        return subset;
    }

    /**
     * Randomly draws (and removes) candidates, adding the valid ones to this set until
     * it holds {@code targetSize} articles or the candidates are exhausted, then sorts
     * this set. Warnings are printed to stderr when candidates are being used up faster
     * than articles are being gathered, and when the target could not be reached.
     * The {@code wikipedia}, {@code minInLinks} and {@code minOutLinks} arguments are not
     * used here; the link minimums are applied while gathering candidates.
     */
    private void buildFromCandidates(Wikipedia wikipedia, Vector<Article> candidates, int targetSize, Integer minInLinks, Integer minOutLinks, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, ArticleSet exclude) {
        DecimalFormat percentFormat = new DecimalFormat("#0.00 %");
        int totalCandidates = candidates.size();
        ProgressTracker tracker = new ProgressTracker(totalCandidates, "Refining candidates (ETA is worst case)", ArticleSet.class);
        double lastWarnedAt = 0.0d;
        while (!candidates.isEmpty()) {
            tracker.update();
            if (size() == targetSize) {
                break;
            }
            int index = (int) Math.floor(Math.random() * candidates.size());
            Article candidate = candidates.remove(index);
            if (isArticleValid(candidate, minLinkProportion, maxLinkProportion, minWordCount, maxWordCount, maxListProportion, mustMatch, mustNotMatch, exclude)) {
                add(candidate);
            }
            // Both ratios must be computed in floating point; integer division would
            // truncate them to 0 or 1 and make the comparison below meaningless.
            double exhausted = 1.0d - ((double) candidates.size() / totalCandidates);
            if (exhausted >= lastWarnedAt + 0.01d) {
                double gathered = (double) size() / targetSize;
                if (exhausted > gathered) {
                    // The '%' in the pattern already scales by 100, so both ratios are passed as fractions.
                    System.err.println("ArticleSet | Warning : we have exhausted " + percentFormat.format(exhausted) + " of the available pages and only gathered " + percentFormat.format(gathered) + " of the articles needed.");
                    lastWarnedAt = exhausted;
                }
            }
        }
        if (size() < targetSize) {
            System.err.println("ArticleSet | Warning: we could only find " + size() + " suitable articles.");
        }
        Collections.sort(this);
    }

    /**
     * Writes the id of every article in this set to the given file, one id per line.
     *
     * @param file the file to write to
     * @throws IOException if the file cannot be written
     */
    public void save(File file) throws IOException {
        // try-with-resources guarantees the writer is flushed and closed even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
            for (Article article : this) {
                writer.write(article.getId() + "\n");
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Iterates over all article pages and collects those with at least the requested
     * number of incoming and outgoing links.
     *
     * @param wikipedia the Wikipedia instance to iterate over
     * @param num minimum number of incoming links, or {@code null} for no minimum
     * @param num2 minimum number of outgoing links, or {@code null} for no minimum
     * @return the articles that satisfy both minimums, in iteration order
     */
    public static Vector<Article> getRoughCandidates(Wikipedia wikipedia, Integer num, Integer num2) {
        Vector<Article> candidates = new Vector<>();
        ProgressTracker tracker = new ProgressTracker(wikipedia.getEnvironment().retrieveStatistic(WEnvironment.StatisticName.articleCount).intValue(), "Gathering rough candidates", ArticleSet.class);
        PageIterator pageIterator = wikipedia.getPageIterator(Page.PageType.article);
        try {
            while (pageIterator.hasNext()) {
                Article article = (Article) pageIterator.next();
                tracker.update();
                // Outgoing links are checked first, as before, so incoming links are
                // only fetched for articles that already pass the outgoing minimum.
                boolean enoughLinksOut = num2 == null || article.getLinksOut().length >= num2.intValue();
                if (enoughLinksOut && (num == null || article.getLinksIn().length >= num.intValue())) {
                    candidates.add(article);
                }
            }
        } finally {
            // Release the iterator even if fetching a page or its links fails.
            pageIterator.close();
        }
        return candidates;
    }

    /**
     * Checks membership with a binary search, which requires this set to be sorted in
     * natural order. Anything that is not an {@link Article} (including {@code null})
     * is reported as not contained.
     *
     * @param obj the object to look for
     * @return {@code true} if the binary search finds the article
     */
    @Override // java.util.ArrayList, java.util.AbstractCollection, java.util.Collection, java.util.List
    public boolean contains(Object obj) {
        if (!(obj instanceof Article)) {
            return false;
        }
        return Collections.binarySearch(this, (Article) obj) >= 0;
    }

    /**
     * Decides whether an article satisfies every constraint that is set. A {@code null}
     * constraint is not checked. Disambiguation pages and excluded articles are always
     * rejected; articles without markup are rejected whenever any markup-based
     * constraint is set.
     */
    private boolean isArticleValid(Article article, Double minLinkProportion, Double maxLinkProportion, Integer minWordCount, Integer maxWordCount, Double maxListProportion, Pattern mustMatch, Pattern mustNotMatch, ArticleSet exclude) {
        LOGGER.debug("Evaluating " + article);
        if (article.getType() == Page.PageType.disambiguation) {
            LOGGER.debug(" - rejected due to disambiguation");
            return false;
        }
        if (exclude != null && exclude.contains(article)) {
            LOGGER.debug(" - rejected due to exclusion list");
            return false;
        }

        boolean checkLinkProportion = minLinkProportion != null || maxLinkProportion != null;
        boolean checkWordStats = checkLinkProportion || minWordCount != null || maxWordCount != null;
        boolean checkPlainText = checkWordStats || maxListProportion != null;

        // The markup is only needed when some constraint inspects it. The patterns are
        // part of that decision; leaving them out would silently skip them.
        if (!checkPlainText && mustMatch == null && mustNotMatch == null) {
            return true;
        }
        String markup = article.getMarkup();
        if (markup == null) {
            return false;
        }
        if (mustMatch != null && !mustMatch.matcher(markup).find()) {
            LOGGER.debug(" - rejected due to mustMatch pattern");
            return false;
        }
        if (mustNotMatch != null && mustNotMatch.matcher(markup).find()) {
            LOGGER.debug(" - rejected due to mustNotMatch pattern");
            return false;
        }
        if (!checkPlainText) {
            // Only patterns were requested; no need to strip the markup.
            return true;
        }

        String plainText = this.stripper.stripExcessNewlines(this.stripper.stripToPlainText(markup, null));
        if (maxListProportion != null) {
            float listProportion = computeListProportion(plainText);
            if (listProportion > maxListProportion.doubleValue()) {
                LOGGER.debug(" - rejected for max list proportion " + listProportion);
                return false;
            }
        }
        if (!checkWordStats) {
            return true;
        }

        int wordCount = new StringTokenizer(plainText).countTokens();
        if (minWordCount != null && wordCount < minWordCount.intValue()) {
            LOGGER.debug(" - rejected for min wordcount " + wordCount);
            return false;
        }
        if (maxWordCount != null && wordCount > maxWordCount.intValue()) {
            LOGGER.debug(" - rejected for max wordcount " + wordCount);
            return false;
        }
        if (!checkLinkProportion) {
            return true;
        }

        // Floating-point division; an article without any words has a proportion of 0
        // rather than triggering a division by zero.
        float linkProportion = wordCount == 0 ? 0.0f : article.getTotalLinksOutCount() / (float) wordCount;
        if (minLinkProportion != null && linkProportion < minLinkProportion.doubleValue()) {
            LOGGER.debug(" - rejected for min link proportion " + linkProportion);
            return false;
        }
        if (maxLinkProportion != null && linkProportion > maxLinkProportion.doubleValue()) {
            LOGGER.debug(" - rejected for max link proportion " + linkProportion);
            return false;
        }
        return true;
    }

    /**
     * Returns the fraction of significant lines (more than 5 characters once ':' and ';'
     * are blanked out and the line is trimmed) that start with '*' or '#'. Returns 0
     * when there are no significant lines.
     */
    private static float computeListProportion(String plainText) {
        int significantLines = 0;
        int listLines = 0;
        for (String rawLine : plainText.split("\n")) {
            String line = rawLine.replace(':', ' ').replace(';', ' ').trim();
            if (line.length() > 5) {
                significantLines++;
                if (line.startsWith("*") || line.startsWith("#")) {
                    listLines++;
                }
            }
        }
        if (significantLines == 0) {
            return 0.0f;
        }
        return (float) listLines / significantLines;
    }
}
