package org.wikipedia.miner.extraction;

import gnu.trove.TIntArrayList;
import gnu.trove.TIntObjectHashMap;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.record.CsvRecordInput;
import org.apache.hadoop.record.CsvRecordOutput;
import org.apache.hadoop.record.RecordInput;
import org.apache.hadoop.record.RecordOutput;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.apache.log4j.WriterAppender;
import org.wikipedia.miner.db.struct.DbIntList;
import org.wikipedia.miner.db.struct.DbLabel;
import org.wikipedia.miner.db.struct.DbPage;
import org.wikipedia.miner.db.struct.DbSenseForLabel;
import org.wikipedia.miner.extraction.CategoryLinkSummaryStep;
import org.wikipedia.miner.extraction.LabelSensesStep;
import org.wikipedia.miner.extraction.PageLabelStep;
import org.wikipedia.miner.extraction.PageLinkSummaryStep;
import org.wikipedia.miner.extraction.PageStep;
import org.wikipedia.miner.extraction.RedirectStep;
import org.wikipedia.miner.extraction.struct.ExLabel;
import org.wikipedia.miner.extraction.struct.ExSenseForLabel;
import org.wikipedia.miner.model.Page;
import org.wikipedia.miner.util.ProgressTracker;

/* loaded from: input_file:org/wikipedia/miner/extraction/DumpExtractor.class */
public class DumpExtractor {
    /** Hadoop configuration taken from the GenericOptionsParser in the constructor. */
    private Configuration conf;
    /** File system obtained from {@link #conf}; all reads and writes in this class go through it. */
    private FileSystem dfs;
    /** Arguments left over after generic Hadoop options are parsed; exactly five are expected by configure(). */
    private String[] args;
    /** args[0]: the dump file to process (must be a readable, non-directory path). */
    private Path inputFile;
    /** args[1]: the language configuration file handed to LanguageConfiguration. */
    private Path langFile;
    /** args[2]: the language code handed to LanguageConfiguration. */
    private String lang;
    /** args[3]: the sentence detection model (must be a readable, non-directory path). */
    private Path sentenceModel;
    /** args[4]: an existing, writable directory that receives temporary and final output. */
    private Path outputDir;
    /** {@code outputDir + "/final"}; created by configure() and used for the finalized csv files. */
    private Path finalDir;
    /** Language configuration built from {@link #dfs}, {@link #lang} and {@link #langFile}. */
    private LanguageConfiguration lc;
    // JobConf keys populated by configureJob() from the five command line arguments.
    // NOTE(review): the input *file* is stored under a key spelled "wm.inputDir".
    protected static final String KEY_INPUT_FILE = "wm.inputDir";
    protected static final String KEY_OUTPUT_DIR = "wm.outputDir";
    protected static final String KEY_LANG_FILE = "wm.langFile";
    protected static final String KEY_LANG_CODE = "wm.langCode";
    protected static final String KEY_SENTENCE_MODEL = "wm.sentenceModel";
    // Names of the dedicated log4j loggers set up by configureLogging(); each is also used as
    // the base name of its log file under outputDir/logs.
    protected static final String LOG_ORPHANED_PAGES = "orphanedPages";
    // NOTE(review): "wierd" is misspelled, but the value is a runtime logger/file name.
    protected static final String LOG_WEIRD_LABEL_COUNT = "wierdLabelCounts";
    // Output locations, all relative to outputDir.
    /** Copy of the dump's siteinfo element, written by extractSiteInfo(). */
    protected static final String OUTPUT_SITEINFO = "final/siteInfo.xml";
    /** Marker holding the last completed step, see readProgress()/writeProgress(). */
    protected static final String OUTPUT_PROGRESS = "tempProgress.csv";
    /** Statistics carried between runs, see readStatistics()/writeStatistics(). */
    protected static final String OUTPUT_TEMPSTATS = "tempStats.csv";
    /** Final statistics, written by finalizeStatistics(). */
    protected static final String OUTPUT_STATS = "final/stats.csv";
    /* loaded from: input_file:org/wikipedia/miner/extraction/DumpExtractor$ExtractionStep.class */
    /**
     * The stages of the extraction pipeline, in the order they are run.
     *
     * The declaration order is significant: run() decides which stages are still
     * outstanding with {@code compareTo}, and writeProgress()/readProgress() persist the
     * last completed stage by its {@code ordinal()}. Reordering or inserting constants
     * would therefore change how existing progress files are interpreted.
     */
    public enum ExtractionStep {
        page,
        redirect,
        labelSense,
        pageLabel,
        labelOccurrence,
        pageLink,
        categoryParent,
        articleParent,
        // The two stages below are not executed by the run() method visible in this file.
        linkCooccurrence,
        relatedness
    }

    public DumpExtractor(String[] strArr) throws Exception {
        GenericOptionsParser genericOptionsParser = new GenericOptionsParser(strArr);
        this.conf = genericOptionsParser.getConfiguration();
        this.dfs = FileSystem.get(this.conf);
        this.args = genericOptionsParser.getRemainingArgs();
        configure();
        configureLogging();
    }

    /**
     * Command line entry point: builds an extractor, runs it, and exits the JVM with the
     * status returned by the run.
     *
     * @param strArr the command line arguments
     * @throws Exception if construction or the run itself fails
     */
    public static void main(String[] strArr) throws Exception {
        DumpExtractor extractor = new DumpExtractor(strArr);
        int exitCode = extractor.run();
        System.exit(exitCode);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Copies the five extractor arguments into a job configuration and applies the
     * settings shared by every step.
     *
     * @param jobConf the job configuration to populate
     * @param strArr  input file, language file, language code, sentence model and output
     *                directory, in that order
     * @return the same {@code jobConf} instance, for chaining
     */
    public static JobConf configureJob(JobConf jobConf, String[] strArr) {
        // Position i of this table is the key under which strArr[i] is stored.
        String[] keysByPosition = {
            KEY_INPUT_FILE,
            KEY_LANG_FILE,
            KEY_LANG_CODE,
            KEY_SENTENCE_MODEL,
            KEY_OUTPUT_DIR
        };
        for (int position = 0; position < keysByPosition.length; position++) {
            jobConf.set(keysByPosition[position], strArr[position]);
        }
        jobConf.set("mapred.child.java.opts", "-Xmx3G");
        jobConf.setNumReduceTasks(1);
        return jobConf;
    }

    /**
     * Validates the five remaining command line arguments and initialises the path
     * fields, the language configuration and the {@code final} output directory.
     *
     * @throws IllegalArgumentException if the number of arguments is not exactly five
     * @throws IOException if the input file or sentence model is a directory or is not
     *         readable by its owner, or if the output directory is not a directory
     *         writable by its owner
     * @throws Exception for anything raised while querying the file system or loading
     *         the language configuration
     */
    private void configure() throws Exception {
        if (this.args.length != 5) {
            throw new IllegalArgumentException("Please specify a xml dump of wikipedia, a language.xml config file, a language code, an openNLP sentence detection model, and a writable output directory");
        }
        this.inputFile = new Path(this.args[0]);
        requireReadableFile(this.inputFile);

        this.langFile = new Path(this.args[1]);
        this.lang = this.args[2];
        // A constructor call can never evaluate to null, so there is nothing to check
        // here; a failure to load the configuration has to surface as an exception.
        this.lc = new LanguageConfiguration(this.dfs, this.lang, this.langFile);

        this.sentenceModel = new Path(this.args[3]);
        requireReadableFile(this.sentenceModel);

        this.outputDir = new Path(this.args[4]);
        FileStatus outputStatus = this.dfs.getFileStatus(this.outputDir);
        if (!outputStatus.isDir() || !outputStatus.getPermission().getUserAction().implies(FsAction.WRITE)) {
            throw new IOException("'" + this.outputDir + "' is not a writable directory");
        }
        this.finalDir = new Path(this.outputDir + "/final");
        this.dfs.mkdirs(this.finalDir);
    }

    /**
     * Ensures that the given path is a plain file whose owner permission includes read.
     *
     * @param path the path to check
     * @throws IOException if the path is a directory or lacks owner read permission, or
     *         if the file system cannot provide its status
     */
    private void requireReadableFile(Path path) throws IOException {
        FileStatus status = this.dfs.getFileStatus(path);
        if (status.isDir() || !status.getPermission().getUserAction().implies(FsAction.READ)) {
            throw new IOException("'" + path + "' is not readable or does not exist");
        }
    }

    /**
     * Creates {@code outputDir/logs} and gives each special-purpose logger its own log
     * file there, detached from the parent loggers' appenders.
     *
     * @throws IOException if the directory or one of the log files cannot be created
     */
    private void configureLogging() throws IOException {
        Path logDir = new Path(this.outputDir + "/logs");
        this.dfs.mkdirs(logDir);

        String[] loggerNames = {LOG_ORPHANED_PAGES, LOG_WEIRD_LABEL_COUNT};
        for (String loggerName : loggerNames) {
            Logger dedicated = Logger.getLogger(loggerName);
            // Keep these messages out of whatever the parent loggers write to.
            dedicated.setAdditivity(false);
            Path logFile = new Path(logDir + "/" + loggerName + ".log");
            OutputStreamWriter sink = new OutputStreamWriter(this.dfs.create(logFile));
            dedicated.addAppender(new WriterAppender(new PatternLayout("%-5p: %m%n"), sink));
        }
    }

    /**
     * Runs every extraction step that has not been completed yet, in enum order.
     *
     * The last completed step is read from the progress file, so an interrupted run
     * resumes after it. Each step deletes its own temporary directory, runs its
     * map/reduce tool, finalizes its output files and then records itself as completed.
     *
     * @return 0 when all outstanding steps succeeded, otherwise the non-zero exit code of
     *         the first tool that failed (later steps are not attempted)
     * @throws Exception if a tool or any of the file system operations fails
     */
    private int run() throws Exception {
        Logger.getLogger(DumpExtractor.class).info("Extracting site info");
        extractSiteInfo();

        ExtractionStep lastCompleted = readProgress();
        TreeMap<String, Long> stats = lastCompleted != null ? readStatistics() : new TreeMap<String, Long>();

        if (lastCompleted == null) {
            ExtractionStep step = ExtractionStep.page;
            long startedAt = beginStep(step);
            PageStep pageStep = new PageStep();
            int exitCode = ToolRunner.run(new Configuration(), pageStep, this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            stats = pageStep.updateStats(stats);
            stats.put("lastEdit", getLastEdit());
            writeStatistics(stats);
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.redirect) < 0) {
            ExtractionStep step = ExtractionStep.redirect;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new RedirectStep(), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeFile(step, RedirectStep.Output.redirectSourcesByTarget.name());
            finalizeFile(step, RedirectStep.Output.redirectTargetsBySource.name());
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.labelSense) < 0) {
            ExtractionStep step = ExtractionStep.labelSense;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new LabelSensesStep(), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeFile(step, LabelSensesStep.Output.sentenceSplits.name());
            finalizeFile(step, LabelSensesStep.Output.translations.name());
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.pageLabel) < 0) {
            ExtractionStep step = ExtractionStep.pageLabel;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new PageLabelStep(), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeFile(step, PageLabelStep.Output.pageLabel.name());
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.labelOccurrence) < 0) {
            ExtractionStep step = ExtractionStep.labelOccurrence;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new LabelOccurrencesStep(), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeLabels();
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.pageLink) < 0) {
            ExtractionStep step = ExtractionStep.pageLink;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new PageLinkSummaryStep(), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeFile(step, PageLinkSummaryStep.Output.pageLinkIn.name());
            finalizeFile(step, PageLinkSummaryStep.Output.pageLinkOut.name());
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.categoryParent) < 0) {
            ExtractionStep step = ExtractionStep.categoryParent;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new CategoryLinkSummaryStep(step), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeFile(step, CategoryLinkSummaryStep.Output.categoryParents.name());
            finalizeFile(step, CategoryLinkSummaryStep.Output.childCategories.name());
            lastCompleted = completeStep(step, startedAt);
        }
        if (lastCompleted.compareTo(ExtractionStep.articleParent) < 0) {
            ExtractionStep step = ExtractionStep.articleParent;
            long startedAt = beginStep(step);
            int exitCode = ToolRunner.run(new Configuration(), new CategoryLinkSummaryStep(step), this.args);
            if (exitCode != 0) {
                return failStep(step, exitCode);
            }
            finalizeFile(step, CategoryLinkSummaryStep.Output.articleParents.name());
            finalizeFile(step, CategoryLinkSummaryStep.Output.childArticles.name());
            // Page depths need both the category and the article hierarchy, so pages and
            // the final statistics can only be written once this last step is through.
            finalizePages(stats);
            finalizeStatistics(stats);
            completeStep(step, startedAt);
        }
        return 0;
    }

    /**
     * Announces a step, removes its temporary output directory and starts its timer.
     *
     * @param step the step about to run
     * @return the start time in milliseconds, to be handed to completeStep
     * @throws IOException if the directory cannot be deleted
     */
    private long beginStep(ExtractionStep step) throws IOException {
        Logger.getLogger(DumpExtractor.class).info("Starting " + step + " step");
        this.dfs.delete(new Path(this.outputDir + "/" + getDirectoryName(step)), true);
        return System.currentTimeMillis();
    }

    /**
     * Logs that a step's tool exited abnormally.
     *
     * @param step     the step that failed
     * @param exitCode the tool's non-zero exit code
     * @return {@code exitCode}, so callers can write {@code return failStep(...)}
     */
    private int failStep(ExtractionStep step, int exitCode) {
        Logger.getLogger(DumpExtractor.class).fatal("Could not complete " + step + " step. Check map/reduce user logs for an explanation.");
        return exitCode;
    }

    /**
     * Records a step as completed and prints how long it took.
     *
     * @param step      the step that just finished
     * @param startedAt the value returned by beginStep for this step
     * @return {@code step}, the new "last completed" marker
     * @throws IOException if the progress file cannot be written
     */
    private ExtractionStep completeStep(ExtractionStep step, long startedAt) throws IOException {
        writeProgress(step);
        System.out.println(step + " step completed in " + formatElapsed(System.currentTimeMillis() - startedAt));
        return step;
    }

    /**
     * Renders a duration as HH:mm:ss.
     *
     * The arithmetic is done directly on the millisecond count: pushing a duration
     * through a date formatter shifts it by the JVM's time zone offset and wraps it
     * after 24 hours.
     *
     * @param elapsedMillis the duration in milliseconds
     * @return the duration with at least two digits per field; hours are not capped
     */
    private static String formatElapsed(long elapsedMillis) {
        long totalSeconds = elapsedMillis / 1000;
        long hours = totalSeconds / 3600;
        long minutes = (totalSeconds % 3600) / 60;
        long seconds = totalSeconds % 60;
        return String.format("%02d:%02d:%02d", Long.valueOf(hours), Long.valueOf(minutes), Long.valueOf(seconds));
    }

    /**
     * Reads the last completed step from the progress file.
     *
     * The file holds a single character whose code is the step's ordinal, as written by
     * writeProgress().
     *
     * @return the last completed step, or {@code null} when the file cannot be read, is
     *         empty, or holds a value that is not a valid ordinal; callers treat
     *         {@code null} as "start from the beginning"
     */
    private ExtractionStep readProgress() {
        Path progressPath = new Path(this.outputDir + "/" + OUTPUT_PROGRESS);
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(progressPath)))) {
            int ordinal = reader.read();
            ExtractionStep[] steps = ExtractionStep.values();
            // read() yields -1 for an empty file; anything outside the enum is corrupt.
            if (ordinal < 0 || ordinal >= steps.length) {
                Logger.getLogger(DumpExtractor.class).warn("Ignoring unusable progress marker " + ordinal + " in '" + progressPath + "'");
                return null;
            }
            return steps[ordinal];
        } catch (IOException e) {
            // No readable progress file: behave as if nothing has been completed yet.
            return null;
        }
    }

    /**
     * Overwrites the progress file with the given step.
     *
     * The step is stored as a single character whose code is the step's ordinal, which
     * is the form readProgress() expects.
     *
     * @param extractionStep the step that has just been completed
     * @throws IOException if the progress file cannot be created or written
     */
    private void writeProgress(ExtractionStep extractionStep) throws IOException {
        Path progressPath = new Path(this.outputDir + "/" + OUTPUT_PROGRESS);
        // try-with-resources so the stream is released even when the write fails.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(progressPath)))) {
            writer.write(extractionStep.ordinal());
        }
    }

    /**
     * Loads the statistics saved by a previous run from the temporary statistics file.
     *
     * Each line is parsed as a csv record made of a string key followed by a long value.
     * Reading is best-effort: if the file is missing or a line cannot be read, the
     * entries gathered so far are returned and the problem is logged as a warning.
     *
     * @return the statistics that could be read, keyed by name; possibly empty
     * @throws IOException never thrown by this implementation; kept in the signature for
     *         existing callers
     */
    private TreeMap<String, Long> readStatistics() throws IOException {
        TreeMap<String, Long> stats = new TreeMap<String, Long>();
        Path statsPath = new Path(this.outputDir + "/" + OUTPUT_TEMPSTATS);
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(statsPath)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the terminator, so it is restored before the line
                // is handed to the csv parser.
                CsvRecordInput record = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                stats.put(record.readString((String) null), Long.valueOf(record.readLong((String) null)));
            }
        } catch (IOException e) {
            Logger.getLogger(DumpExtractor.class).warn("Could not read statistics from '" + statsPath + "'; continuing with the " + stats.size() + " entries read so far", e);
        }
        return stats;
    }

    /**
     * Saves the statistics to the temporary statistics file so a later run can resume
     * with them.
     *
     * One csv record (string key, long value) is written per line, in key order.
     *
     * @param treeMap the statistics to persist
     * @throws IOException if the file cannot be created or written
     */
    private void writeStatistics(TreeMap<String, Long> treeMap) throws IOException {
        Path statsPath = new Path(this.outputDir + "/" + OUTPUT_TEMPSTATS);
        // try-with-resources so the stream is released even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(statsPath)))) {
            for (Map.Entry<String, Long> entry : treeMap.entrySet()) {
                ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                CsvRecordOutput record = new CsvRecordOutput(buffer);
                record.writeString(entry.getKey(), (String) null);
                record.writeLong(entry.getValue().longValue(), (String) null);
                writer.write(buffer.toString("UTF-8"));
                writer.newLine();
            }
        }
    }

    /**
     * Assigns a depth to every page reachable from the root category by walking the
     * category hierarchy breadth-first.
     *
     * The root category gets depth 0. A category first reached on level {@code d} gets
     * depth {@code d}; each of its child articles that has no depth yet gets
     * {@code d + 1}. As a side effect the number of levels visited is stored in
     * {@code treeMap} under {@code "maxCategoryDepth"}.
     *
     * @param treeMap            statistics; must contain the root category id, and
     *                           receives {@code "maxCategoryDepth"}
     * @param tIntObjectHashMap  child category ids, keyed by parent category id
     * @param tIntObjectHashMap2 child article ids, keyed by parent category id
     * @return page id to depth, for every page that was reached
     * @throws IllegalStateException if {@code treeMap} has no root category id
     */
    private HashMap<Integer, Short> calculatePageDepths(TreeMap<String, Long> treeMap, TIntObjectHashMap<TIntArrayList> tIntObjectHashMap, TIntObjectHashMap<TIntArrayList> tIntObjectHashMap2) {
        String rootKey = PageStep.Counter.rootCategoryId.name();
        Long rootId = treeMap.get(rootKey);
        if (rootId == null) {
            throw new IllegalStateException("Statistics do not contain '" + rootKey + "'; cannot calculate page depths");
        }

        HashMap<Integer, Short> depths = new HashMap<Integer, Short>();
        short depth = 0;
        // Levels are walked by iteration rather than by removing the head of a Vector,
        // which costs time proportional to the queue length on every removal.
        ArrayList<Integer> currentLevel = new ArrayList<Integer>();
        currentLevel.add(Integer.valueOf(rootId.intValue()));

        while (!currentLevel.isEmpty()) {
            ArrayList<Integer> nextLevel = new ArrayList<Integer>();
            Short articleDepth = Short.valueOf((short) (depth + 1));
            for (Integer categoryId : currentLevel) {
                if (depths.containsKey(categoryId)) {
                    // Queued more than once, or already reached on a shallower level.
                    continue;
                }
                depths.put(categoryId, Short.valueOf(depth));

                TIntArrayList childArticles = tIntObjectHashMap2.get(categoryId.intValue());
                if (childArticles != null) {
                    for (int i = 0; i < childArticles.size(); i++) {
                        Integer articleId = Integer.valueOf(childArticles.get(i));
                        if (!depths.containsKey(articleId)) {
                            depths.put(articleId, articleDepth);
                        }
                    }
                }

                TIntArrayList childCategories = tIntObjectHashMap.get(categoryId.intValue());
                if (childCategories != null) {
                    for (int i = 0; i < childCategories.size(); i++) {
                        Integer childId = Integer.valueOf(childCategories.get(i));
                        if (!depths.containsKey(childId)) {
                            nextLevel.add(childId);
                        }
                    }
                }
            }
            currentLevel = nextLevel;
            depth++;
        }
        treeMap.put("maxCategoryDepth", Long.valueOf(depth));
        return depths;
    }

    /**
     * Loads a parent-to-children mapping from the temporary output of a step.
     *
     * Every file in the step's directory whose name starts with {@code str} is read line
     * by line; each line is parsed as a csv record holding a parent id followed by a
     * DbIntList of child ids. Parents with no children are left out of the result.
     *
     * @param extractionStep the step whose output directory is scanned
     * @param str            the file name prefix selecting the files to read
     * @return child ids keyed by parent id
     * @throws IOException if a file cannot be listed, opened or read
     */
    private TIntObjectHashMap<TIntArrayList> gatherChildren(ExtractionStep extractionStep, final String str) throws IOException {
        TIntObjectHashMap<TIntArrayList> childrenByParent = new TIntObjectHashMap<TIntArrayList>();
        FileStatus[] files = this.dfs.listStatus(new Path(this.outputDir + "/" + getDirectoryName(extractionStep)), new PathFilter() {
            public boolean accept(Path path) {
                return path.getName().startsWith(str);
            }
        });
        for (FileStatus file : files) {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(file.getPath())))) {
                String line;
                // Stop at end of file; readLine() keeps returning null from then on.
                while ((line = reader.readLine()) != null) {
                    RecordInput record = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                    int parentId = record.readInt("parent");
                    DbIntList childList = new DbIntList();
                    childList.deserialize(record);
                    if (childList.getValues() != null && !childList.getValues().isEmpty()) {
                        TIntArrayList childIds = new TIntArrayList();
                        Iterator<Integer> it = childList.getValues().iterator();
                        while (it.hasNext()) {
                            childIds.add(it.next().intValue());
                        }
                        childrenByParent.put(parentId, childIds);
                    }
                }
            }
        }
        return childrenByParent;
    }

    /**
     * Copies the {@code <siteinfo>} element of the dump into the site info output file.
     *
     * The dump is scanned line by line. Copying starts at the first line consisting only
     * of the opening tag (surrounding whitespace allowed) and stops after the line
     * consisting only of the closing tag, so the rest of the dump is not read. If no
     * opening tag is found the output file is created empty.
     *
     * @throws IOException if the dump cannot be read or the output cannot be written
     */
    private void extractSiteInfo() throws IOException {
        Path siteInfoPath = new Path(this.outputDir + "/" + OUTPUT_SITEINFO);
        // try-with-resources so both streams are released even when copying fails.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(this.inputFile)));
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(siteInfoPath)))) {
            boolean insideSiteInfo = false;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!insideSiteInfo && line.matches("\\s*\\<siteinfo\\>\\s*")) {
                    insideSiteInfo = true;
                }
                if (insideSiteInfo) {
                    writer.write(line);
                    writer.newLine();
                    if (line.matches("\\s*\\<\\/siteinfo\\>\\s*")) {
                        break;
                    }
                }
            }
        }
    }

    /**
     * Writes {@code page.csv} into the final directory, adding a depth to every page
     * for which one could be calculated.
     *
     * Depths come from calculatePageDepths(), fed with the child-category and
     * child-article mappings gathered from the two category link steps. Pages are read
     * from the page step's temporary files; a non-redirect page without a depth is
     * reported on the orphaned-pages logger and written unchanged.
     *
     * @param treeMap the statistics; passed on to calculatePageDepths(), which reads the
     *                root category id from it and stores the maximum depth in it
     * @throws IOException if any input cannot be read or the output cannot be written
     */
    private void finalizePages(TreeMap<String, Long> treeMap) throws IOException {
        HashMap<Integer, Short> depthsById = calculatePageDepths(treeMap, gatherChildren(ExtractionStep.categoryParent, CategoryLinkSummaryStep.Output.childCategories.name()), gatherChildren(ExtractionStep.articleParent, CategoryLinkSummaryStep.Output.childArticles.name()));
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(new Path(this.finalDir + "/page.csv"))))) {
            FileStatus[] tempPageFiles = this.dfs.listStatus(new Path(this.outputDir + "/" + getDirectoryName(ExtractionStep.page)), new PathFilter() {
                public boolean accept(Path path) {
                    return path.getName().startsWith(PageStep.Output.tempPage.name());
                }
            });
            for (FileStatus tempPageFile : tempPageFiles) {
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(tempPageFile.getPath())))) {
                    String line;
                    // Stop at end of file; readLine() keeps returning null from then on.
                    while ((line = reader.readLine()) != null) {
                        RecordInput record = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                        int pageId = record.readInt("id");
                        DbPage page = new DbPage();
                        page.deserialize(record);
                        Page.PageType pageType = Page.PageType.values()[page.getType()];
                        Short depth = depthsById.get(Integer.valueOf(pageId));
                        if (depth != null) {
                            page.setDepth(depth.intValue());
                        } else if (pageType != Page.PageType.redirect) {
                            Logger.getLogger(LOG_ORPHANED_PAGES).warn("Could not identify depth of page " + pageId + ":" + page.getTitle() + "[" + pageType + "]");
                        }
                        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                        CsvRecordOutput out = new CsvRecordOutput(buffer);
                        out.writeInt(pageId, "id");
                        page.serialize(out);
                        // No explicit line break is added here.
                        // NOTE(review): presumably serialize() terminates the record itself - confirm.
                        writer.write(buffer.toString("UTF-8"));
                    }
                }
            }
        }
    }

    /**
     * Merges the label files of the label sense step with those of the label occurrence
     * step into {@code label.csv} in the final directory.
     *
     * Both inputs are consumed in parallel, one record at a time, and matched on label
     * text; the comparison logic relies on both being sorted by label text.
     * <ul>
     * <li>A label present in both gets its text counts from the occurrence record and is
     *     written once; suspicious count combinations are logged.</li>
     * <li>A label present only in the sense files is written as it is.</li>
     * <li>A label present only in the occurrence files is logged as an error and written
     *     as it is.</li>
     * </ul>
     *
     * @throws IOException if an input cannot be read or the output cannot be written
     */
    private void finalizeLabels() throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(new Path(this.finalDir + "/label.csv"))))) {
            FileStatus[] senseFiles = this.dfs.listStatus(new Path(this.outputDir + "/" + getDirectoryName(ExtractionStep.labelSense)), new PathFilter() {
                public boolean accept(Path path) {
                    return path.getName().startsWith(LabelSensesStep.Output.tempLabel.name());
                }
            });
            FileStatus[] occurrenceFiles = this.dfs.listStatus(new Path(this.outputDir + "/" + getDirectoryName(ExtractionStep.labelOccurrence)), new PathFilter() {
                public boolean accept(Path path) {
                    return path.getName().startsWith(LabelSensesStep.Output.tempLabel.name());
                }
            });

            // The progress total has to cover BOTH file sets, because the shared
            // bytesRead counter below is advanced while reading either of them.
            long totalBytes = 0;
            for (FileStatus senseFile : senseFiles) {
                totalBytes += senseFile.getLen();
            }
            for (FileStatus occurrenceFile : occurrenceFiles) {
                totalBytes += occurrenceFile.getLen();
            }
            ProgressTracker tracker = new ProgressTracker(totalBytes, "Finalizing labels", DumpExtractor.class);

            // Single-element arrays so that getNextLine() can update them in place.
            long[] bytesRead = {0};
            int[] senseFileIndex = {0};
            int[] occurrenceFileIndex = {0};

            // NOTE(review): getNextLine() is not visible here; it may close these readers
            // itself when a file is exhausted. Closing them again on exit is harmless.
            try (BufferedReader senseReader = new BufferedReader(new InputStreamReader(this.dfs.open(senseFiles[senseFileIndex[0]].getPath())));
                    BufferedReader occurrenceReader = new BufferedReader(new InputStreamReader(this.dfs.open(occurrenceFiles[occurrenceFileIndex[0]].getPath())))) {
                String senseText = null;
                String occurrenceText = null;
                ExLabel senseLabel = null;
                ExLabel occurrenceLabel = null;
                while (true) {
                    // Refill whichever side was consumed in the previous round.
                    if (senseText == null && senseFileIndex[0] < senseFiles.length) {
                        String line = getNextLine(senseReader, senseFiles, senseFileIndex, bytesRead);
                        if (line != null) {
                            CsvRecordInput record = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                            senseText = record.readString("labelText");
                            senseLabel = new ExLabel();
                            senseLabel.deserialize(record);
                        }
                    }
                    if (occurrenceText == null && occurrenceFileIndex[0] < occurrenceFiles.length) {
                        String line = getNextLine(occurrenceReader, occurrenceFiles, occurrenceFileIndex, bytesRead);
                        if (line != null) {
                            CsvRecordInput record = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                            occurrenceText = record.readString("labelText");
                            occurrenceLabel = new ExLabel();
                            occurrenceLabel.deserialize(record);
                        }
                    }
                    if (senseText == null && occurrenceText == null) {
                        // Both inputs are exhausted.
                        return;
                    }

                    if (senseText != null && senseText.equals(occurrenceText)) {
                        // Same label on both sides: combine link counts with text counts.
                        senseLabel.setTextDocCount(occurrenceLabel.getTextDocCount());
                        senseLabel.setTextOccCount(occurrenceLabel.getTextOccCount());
                        if (senseLabel.getLinkOccCount() > senseLabel.getTextOccCount()) {
                            Logger.getLogger(LOG_WEIRD_LABEL_COUNT).warn("Label '" + senseText + "' occurs " + senseLabel.getLinkOccCount() + " times as links, but only " + senseLabel.getTextOccCount() + " times in plain text.");
                        }
                        if (senseLabel.getLinkDocCount() > senseLabel.getTextDocCount()) {
                            Logger.getLogger(LOG_WEIRD_LABEL_COUNT).warn("Label '" + senseText + "' occurs in " + senseLabel.getLinkDocCount() + " documents as links, but only " + senseLabel.getTextDocCount() + " in plain text.");
                        }
                        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                        RecordOutput out = new CsvRecordOutput(buffer);
                        out.writeString(senseText, "labelText");
                        convert(senseLabel).serialize(out);
                        writer.write(buffer.toString("UTF-8"));
                        senseLabel = null;
                        senseText = null;
                        occurrenceLabel = null;
                        occurrenceText = null;
                    } else {
                        if (senseText == null || (occurrenceText != null && senseText.compareTo(occurrenceText) >= 0)) {
                            // The occurrence side is behind: its label has no sense record.
                            Logger.getLogger(LOG_WEIRD_LABEL_COUNT).error("Found label '" + occurrenceText + "' without any senses or link occurances.");
                            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                            RecordOutput out = new CsvRecordOutput(buffer);
                            out.writeString(occurrenceText, "labelText");
                            convert(occurrenceLabel).serialize(out);
                            writer.write(buffer.toString("UTF-8"));
                            occurrenceLabel = null;
                            occurrenceText = null;
                        } else {
                            // The sense side is behind: its label has no occurrence record.
                            if (senseLabel.getLinkOccCount() > 0) {
                                Logger.getLogger(LOG_WEIRD_LABEL_COUNT).warn("Found label '" + senseText + "' without any text occurances. It occurs in " + senseLabel.getLinkOccCount() + " links.");
                            }
                            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                            RecordOutput out = new CsvRecordOutput(buffer);
                            out.writeString(senseText, "labelText");
                            convert(senseLabel).serialize(out);
                            writer.write(buffer.toString("UTF-8"));
                            senseLabel = null;
                            senseText = null;
                        }
                        tracker.update(bytesRead[0]);
                    }
                }
            }
        }
    }

    /**
     * Writes the statistics to the final statistics file.
     *
     * One csv record (string key, long value) is written per line, in key order.
     *
     * @param treeMap the statistics to write
     * @throws IOException if the file cannot be created or written
     */
    private void finalizeStatistics(TreeMap<String, Long> treeMap) throws IOException {
        Path statsPath = new Path(this.outputDir + "/" + OUTPUT_STATS);
        // try-with-resources so the stream is released even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(statsPath)))) {
            for (Map.Entry<String, Long> entry : treeMap.entrySet()) {
                ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                CsvRecordOutput record = new CsvRecordOutput(buffer);
                record.writeString(entry.getKey(), (String) null);
                record.writeLong(entry.getValue().longValue(), (String) null);
                writer.write(buffer.toString("UTF-8"));
                writer.newLine();
            }
        }
    }

    /**
     * Concatenates every partial output file of an extraction step whose name starts with
     * {@code str} into a single {@code <finalDir>/<str>.csv} file, copying line by line and
     * reporting progress while doing so.
     *
     * @param extractionStep the step whose temporary directory holds the partial files
     * @param str            file-name prefix selecting the partial files; also the base name of
     *                       the merged file
     * @throws IOException if listing, reading or writing any of the files fails
     */
    private void finalizeFile(ExtractionStep extractionStep, final String str) throws IOException {
        // The output file is created before the inputs are listed, as before; try-with-resources
        // guarantees it is closed on every exit path.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(this.dfs.create(new Path(this.finalDir + "/" + str + ".csv"))))) {
            FileStatus[] parts = this.dfs.listStatus(new Path(this.outputDir + "/" + getDirectoryName(extractionStep)), new PathFilter() {
                public boolean accept(Path path) {
                    return path.getName().startsWith(str);
                }
            });

            // The combined length of all parts is the progress tracker's target.
            long totalLength = 0;
            for (FileStatus part : parts) {
                totalLength += part.getLen();
            }
            ProgressTracker tracker = new ProgressTracker(totalLength, "finalizing " + str, DumpExtractor.class);

            long consumed = 0;
            for (FileStatus part : parts) {
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(part.getPath())))) {
                    String line;
                    // Stop at end of file; the previous loop never terminated once readLine()
                    // returned null, so the reader was never closed and the method never returned.
                    while ((line = reader.readLine()) != null) {
                        // Progress is estimated as characters read plus one per line
                        // (presumably standing in for the stripped line terminator).
                        consumed += line.length() + 1;
                        tracker.update(consumed);
                        writer.write(line);
                        writer.newLine();
                    }
                }
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Builds the database representation of a label from its extraction-time counterpart.
     * <p>
     * Every sense of the label is copied into a {@link DbSenseForLabel}. The senses are then
     * ordered by link occurrence count (highest first), then by link document count (highest
     * first), and finally by id (lowest first) so the ordering is fully deterministic.
     *
     * @param exLabel the extracted label to convert
     * @return a new {@link DbLabel} carrying the label's four counts and its sorted senses
     */
    public static DbLabel convert(ExLabel exLabel) {
        ArrayList<DbSenseForLabel> senses = new ArrayList<>();
        for (Map.Entry<Integer, ExSenseForLabel> entry : exLabel.getSensesById().entrySet()) {
            ExSenseForLabel source = entry.getValue();
            DbSenseForLabel sense = new DbSenseForLabel();
            sense.setId(entry.getKey().intValue());
            sense.setLinkOccCount(source.getLinkOccCount());
            sense.setLinkDocCount(source.getLinkDocCount());
            sense.setFromRedirect(source.getFromRedirect());
            sense.setFromTitle(source.getFromTitle());
            senses.add(sense);
        }

        Collections.sort(senses, new Comparator<DbSenseForLabel>() {
            @Override
            public int compare(DbSenseForLabel left, DbSenseForLabel right) {
                // Primitive comparisons replace the deprecated boxing constructors; swapping the
                // arguments (right before left) yields descending order.
                int byOccurrences = Long.compare(right.getLinkOccCount(), left.getLinkOccCount());
                if (byOccurrences != 0) {
                    return byOccurrences;
                }
                int byDocuments = Long.compare(right.getLinkDocCount(), left.getLinkDocCount());
                if (byDocuments != 0) {
                    return byDocuments;
                }
                // Final tie-break: ascending id.
                return Integer.compare(left.getId(), right.getId());
            }
        });

        DbLabel dbLabel = new DbLabel();
        dbLabel.setLinkDocCount(exLabel.getLinkDocCount());
        dbLabel.setLinkOccCount(exLabel.getLinkOccCount());
        dbLabel.setTextDocCount(exLabel.getTextDocCount());
        dbLabel.setTextOccCount(exLabel.getTextOccCount());
        dbLabel.setSenses(senses);
        return dbLabel;
    }

    /**
     * Returns the next line from {@code bufferedReader}, or, once that reader is exhausted, the
     * first line of the next non-empty file in {@code fileStatusArr}.
     * <p>
     * NOTE(review): a reader opened here for a following file is not handed back to the caller,
     * which keeps its reference to the now-closed {@code bufferedReader}; only the first line of
     * that file is returned from this method. Fixing this needs a signature change (e.g. a
     * reader holder) — confirm how the caller is expected to continue reading.
     *
     * @param bufferedReader reader over the file currently being consumed; closed when exhausted
     * @param fileStatusArr  the files to read, in order
     * @param iArr           single-element holder for the index of the current file; advanced
     *                       each time a file is exhausted
     * @param jArr           single-element counter, increased by the line length plus one when
     *                       the first line of a following file is read
     * @return the next line, or {@code null} when no further line is available
     * @throws IOException if reading, closing or opening a file fails
     */
    private String getNextLine(BufferedReader bufferedReader, FileStatus[] fileStatusArr, int[] iArr, long[] jArr) throws IOException {
        String line = bufferedReader.readLine();
        if (line != null) {
            return line;
        }

        // The current file is exhausted: advance the index and release the reader.
        iArr[0] = iArr[0] + 1;
        bufferedReader.close();

        while (iArr[0] < fileStatusArr.length) {
            BufferedReader next = new BufferedReader(new InputStreamReader(this.dfs.open(fileStatusArr[iArr[0]].getPath())));
            line = next.readLine();
            if (line != null) {
                jArr[0] = jArr[0] + line.length() + 1;
                return line;
            }
            // An empty file used to cause a NullPointerException on line.length(); close it and
            // move on to the following file instead.
            next.close();
            iArr[0] = iArr[0] + 1;
        }
        return null;
    }

    /**
     * Scans every {@code tempEditDates} output file of the page step and returns the largest
     * long value found in the second field of any record.
     * <p>
     * Each line is parsed as a CSV record whose first field is an int (read and discarded) and
     * whose second field is a long — presumably a page id followed by its edit timestamp;
     * verify against the writer of these files.
     *
     * @return the maximum second-field value over all records, or {@code null} if there are none
     * @throws IOException if listing or reading the files fails
     */
    private Long getLastEdit() throws IOException {
        FileStatus[] editDateFiles = this.dfs.listStatus(new Path(this.outputDir + "/" + getDirectoryName(ExtractionStep.page)), new PathFilter() {
            public boolean accept(Path path) {
                return path.getName().startsWith(PageStep.Output.tempEditDates.name());
            }
        });

        Long latest = null;
        for (FileStatus file : editDateFiles) {
            // try-with-resources: the reader was previously never closed.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(this.dfs.open(file.getPath())))) {
                String line;
                // Stop at end of file; the previous loop had no exit, so the method never returned.
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the terminator, so it is re-appended before parsing.
                    CsvRecordInput record = new CsvRecordInput(new ByteArrayInputStream((line + "\n").getBytes("UTF-8")));
                    record.readInt((String) null);
                    long value = record.readLong((String) null);
                    if (latest == null || latest.longValue() < value) {
                        latest = Long.valueOf(value);
                    }
                }
            }
        }
        return latest;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Derives the temporary directory name for an extraction step: the literal {@code "temp"}
     * followed by the step's name with its first character upper-cased.
     *
     * @param extractionStep the step to name a directory for
     * @return the directory name for that step
     */
    public static String getDirectoryName(ExtractionStep extractionStep) {
        String stepName = extractionStep.name();
        char initial = Character.toUpperCase(stepName.charAt(0));
        return "temp" + initial + stepName.substring(1);
    }
}
