package org.wikipedia.miner.extraction;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.record.CsvRecordOutput;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.log4j.Logger;
import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument;
import org.wikipedia.miner.db.struct.DbPage;
import org.wikipedia.miner.extraction.DumpExtractor;
import org.wikipedia.miner.model.Page;

/* loaded from: input_file:org/wikipedia/miner/extraction/PageStep.class */
public class PageStep extends Configured implements Tool {
    protected Counters counters;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: org.wikipedia.miner.extraction.PageStep$1, reason: invalid class name */
    /* loaded from: input_file:org/wikipedia/miner/extraction/PageStep$1.class */
    /**
     * Lookup table that translates {@link Page.PageType} ordinals into the small
     * integer case numbers switched on in {@code Step1Mapper.map}.
     *
     * <p>This appears to be the decompiler's rendering of the switch map that the
     * compiler generates for a {@code switch} over an enum (the surrounding
     * decompiler notes mark it as synthetic and renamed from {@code PageStep$1}).
     * Case numbers: 1 = article, 2 = category, 3 = disambiguation, 4 = redirect.
     */
    public static /* synthetic */ class AnonymousClass1 {
        // Indexed by PageType.ordinal(). Slots that are never assigned keep the
        // int default of 0, which matches none of the case numbers 1-4.
        static final /* synthetic */ int[] $SwitchMap$org$wikipedia$miner$model$Page$PageType = new int[Page.PageType.values().length];

        static {
            // Each assignment is guarded separately so that a constant missing at
            // link time only leaves its own slot unassigned.
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.article.ordinal()] = 1;
            } catch (NoSuchFieldError e) {
                // Deliberately ignored: the slot for this constant stays unassigned.
            }
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.category.ordinal()] = 2;
            } catch (NoSuchFieldError e2) {
                // Deliberately ignored: the slot for this constant stays unassigned.
            }
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.disambiguation.ordinal()] = 3;
            } catch (NoSuchFieldError e3) {
                // Deliberately ignored: the slot for this constant stays unassigned.
            }
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.redirect.ordinal()] = 4;
            } catch (NoSuchFieldError e4) {
                // Deliberately ignored: the slot for this constant stays unassigned.
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/wikipedia/miner/extraction/PageStep$Counter.class */
    /**
     * Hadoop job counters updated by {@code Step1Mapper.map} and read back in
     * {@code updateStats} once the job has finished.
     */
    public enum Counter {
        // Incremented by 1 for every parsed page of type article.
        articleCount,
        // Incremented by 1 for every parsed page of type category.
        categoryCount,
        // Incremented by 1 for every parsed page of type disambiguation.
        disambiguationCount,
        // Incremented by 1 for every parsed page of type redirect.
        redirectCount,
        // Incremented by the page id of each category whose normalised title equals
        // the normalised root category name; it therefore holds that id only when
        // exactly one category matched.
        rootCategoryId,
        // Number of categories that matched the root category name. updateStats
        // requires this to be exactly 1 and does not copy it into the stats map.
        rootCategoryCount
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/wikipedia/miner/extraction/PageStep$Output.class */
    /**
     * Names of the outputs produced by this step. The constant names are used
     * verbatim as output file name parts / MultipleOutputs named outputs.
     */
    public enum Output {
        // Main page output: PageOutputFormat substitutes this name for "part" in
        // the task output file name.
        tempPage,
        // Named output receiving (page id, redirect target) for redirect pages.
        tempRedirect,
        // Registered as a named output in run(); nothing in this file writes to it.
        tempRootCategory,
        // Named output receiving (page id, getLastEdited().getTime()) for pages
        // that have a last-edited value.
        tempEditDates
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/PageStep$PageOutputFormat.class */
    /**
     * Output format for the main page output of this step. Each record is written
     * through a {@link CsvRecordOutput} as the page id followed by the serialized
     * {@link DbPage}, into a task output file whose name has "part" replaced by
     * {@code Output.tempPage}.
     */
    private static class PageOutputFormat extends FileOutputFormat<IntWritable, DbPage> {

        /**
         * Record writer that emits one (id, page) record per call onto the
         * wrapped stream.
         */
        protected static class PageRecordWriter implements RecordWriter<IntWritable, DbPage> {
            /** Destination stream; closed by {@link #close(Reporter)}. */
            protected DataOutputStream outStream;

            /**
             * @param dataOutputStream stream that all records are written to
             */
            public PageRecordWriter(DataOutputStream dataOutputStream) {
                this.outStream = dataOutputStream;
            }

            /**
             * Writes the id as an int field tagged "id", then lets the page
             * serialize itself onto the same record output.
             *
             * @param intWritable page id
             * @param dbPage      page details to serialize after the id
             * @throws IOException if writing to the underlying stream fails
             */
            public synchronized void write(IntWritable intWritable, DbPage dbPage) throws IOException {
                // A fresh record output is created for every record, as before.
                final CsvRecordOutput row = new CsvRecordOutput(this.outStream);
                final int pageId = intWritable.get();
                row.writeInt(pageId, "id");
                dbPage.serialize(row);
            }

            /**
             * Closes the underlying stream.
             *
             * @param reporter unused
             * @throws IOException if closing the stream fails
             */
            public synchronized void close(Reporter reporter) throws IOException {
                this.outStream.close();
            }
        }

        private PageOutputFormat() {
        }

        /**
         * Creates the writer for one task's page output.
         *
         * @param fileSystem   unused; the file system is resolved from the output path
         * @param jobConf      job configuration used to resolve the task output path
         * @param str          default output file name; every "part" in it is replaced
         *                     by {@code Output.tempPage.name()}
         * @param progressable progress callback handed to the file creation call
         * @return a writer bound to the newly created task output file
         * @throws IOException if the output file cannot be created
         */
        public RecordWriter<IntWritable, DbPage> getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
            final String pageFileName = str.replace("part", Output.tempPage.name());
            final Path target = FileOutputFormat.getTaskOutputPath(jobConf, pageFileName);
            final FileSystem targetFs = target.getFileSystem(jobConf);
            return new PageRecordWriter(targetFs.create(target, progressable));
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/PageStep$Step1Mapper.class */
    /**
     * Mapper for the page-gathering step. Each input value is the text of one
     * {@code <page>} element; it is parsed into a {@code DumpPage} and emitted as
     * (page id, {@link DbPage}). Per-type counters are updated, redirects are
     * written to the {@code tempRedirect} named output and last-edited times to
     * the {@code tempEditDates} named output.
     *
     * <p>The bridge method {@code map(Object, Object, OutputCollector, Reporter)}
     * that the decompiler emitted is intentionally not declared here: the compiler
     * generates it, and declaring it by hand clashes with {@code Mapper.map} after
     * erasure.
     */
    private static class Step1Mapper extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, DbPage> {
        private LanguageConfiguration lc;
        private DumpPageParser dpp;
        private MultipleOutputs mos;

        private Step1Mapper() {
        }

        /**
         * Locates the site info file and the language file among the local
         * DistributedCache files (matched by file name) and builds the page parser
         * and the named-output writer from them.
         *
         * <p>Failures are logged rather than rethrown, so after a failed
         * configuration the mapper stays unconfigured and every {@code map} call
         * will log an exception instead of producing output.
         *
         * @param jobConf job configuration; reads "wm.langFile" and "wm.langCode"
         */
        public void configure(JobConf jobConf) {
            try {
                this.lc = null;
                SiteInfo siteInfo = null;

                // Both target names are loop-invariant, so resolve them once.
                final String langFile = jobConf.get("wm.langFile");
                final String siteInfoName = new Path("final/siteInfo.xml").getName();
                final String langFileName = langFile == null ? null : new Path(langFile).getName();

                // The cache lookup may yield no array at all; treat that as "nothing cached"
                // so the specific "Could not locate" error below is reported.
                final Path[] cacheFiles = DistributedCache.getLocalCacheFiles(jobConf);
                if (cacheFiles != null) {
                    for (Path path : cacheFiles) {
                        final String cachedName = path.getName();
                        if (cachedName.equals(siteInfoName)) {
                            siteInfo = new SiteInfo(path);
                        }
                        if (cachedName.equals(langFileName)) {
                            this.lc = new LanguageConfiguration(jobConf.get("wm.langCode"), path);
                        }
                    }
                }
                if (siteInfo == null) {
                    throw new IllegalStateException("Could not locate 'final/siteInfo.xml' in DistributedCache");
                }
                if (this.lc == null) {
                    throw new IllegalStateException("Could not locate '" + langFile + "' in DistributedCache");
                }
                this.dpp = new DumpPageParser(this.lc, siteInfo);
                this.mos = new MultipleOutputs(jobConf);
            } catch (Exception e) {
                Logger.getLogger(Step1Mapper.class).error("Could not configure mapper", e);
            }
        }

        /**
         * Parses one page and emits its records and counters. Pages the parser
         * returns {@code null} for are skipped. Any exception is logged and the
         * record is dropped; nothing is propagated to the framework.
         *
         * @param longWritable    input key (unused)
         * @param text            text of one page element
         * @param outputCollector receives (page id, DbPage) for every parsed page
         * @param reporter        used for counters and for the named-output collectors
         * @throws IOException declared by the interface; not thrown by this implementation
         */
        public void map(LongWritable longWritable, Text text, OutputCollector<IntWritable, DbPage> outputCollector, Reporter reporter) throws IOException {
            try {
                DumpPage page = this.dpp.parsePage(text.toString());
                if (page == null) {
                    return;
                }
                // The page type is stored by ordinal; the third DbPage argument is always -1 here.
                outputCollector.collect(new IntWritable(page.getId()), new DbPage(page.getTitle(), page.getType().ordinal(), -1));

                // Switch on the enum directly; types other than these four only get the
                // main record above.
                switch (page.getType()) {
                    case article:
                        reporter.incrCounter(Counter.articleCount, 1L);
                        break;
                    case category:
                        reporter.incrCounter(Counter.categoryCount, 1L);
                        if (Util.normaliseTitle(page.getTitle()).equals(Util.normaliseTitle(this.lc.getRootCategoryName()))) {
                            reporter.incrCounter(Counter.rootCategoryCount, 1L);
                            // The counter is used to carry the id: it is only meaningful
                            // when exactly one category matches the root name.
                            reporter.incrCounter(Counter.rootCategoryId, page.getId());
                        }
                        break;
                    case disambiguation:
                        reporter.incrCounter(Counter.disambiguationCount, 1L);
                        break;
                    case redirect:
                        reporter.incrCounter(Counter.redirectCount, 1L);
                        this.mos.getCollector(Output.tempRedirect.name(), reporter).collect(new IntWritable(page.getId()), new Text(page.getTarget()));
                        break;
                }
                if (page.getLastEdited() != null) {
                    this.mos.getCollector(Output.tempEditDates.name(), reporter).collect(new IntWritable(page.getId()), new LongWritable(page.getLastEdited().getTime()));
                }
            } catch (Exception e) {
                Logger.getLogger(Step1Mapper.class).error("Caught exception", e);
            }
        }

        /**
         * Closes the named outputs, if they were ever opened.
         *
         * @throws IOException if closing the named outputs fails
         */
        public void close() throws IOException {
            super.close();
            // mos is still null when configure() failed; avoid masking that with an NPE.
            if (this.mos != null) {
                this.mos.close();
            }
        }
    }

    /**
     * Configures and runs the "gather pages" job, blocking until it completes,
     * and stores the finished job's counters in {@code counters}.
     *
     * <p>Reads "wm.outputDir", "wm.langFile" and "wm.inputDir" from the job
     * configuration after {@code DumpExtractor.configureJob} has been applied.
     *
     * @param strArr arguments passed on to {@code DumpExtractor.configureJob}
     * @return always 0
     * @throws Exception if configuring or running the job fails
     */
    public int run(String[] strArr) throws Exception {
        final JobConf job = new JobConf(PageStep.class);
        DumpExtractor.configureJob(job, strArr);

        // Job identity and record types.
        job.setJobName("WM: gather pages");
        job.setMapperClass(Step1Mapper.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(DbPage.class);

        // Input is split into <page>...</page> fragments; pages go out via PageOutputFormat.
        job.setInputFormat(XmlInputFormat.class);
        job.set(XmlInputFormat.START_TAG_KEY, "<page>");
        job.set(XmlInputFormat.END_TAG_KEY, "</page>");
        job.setOutputFormat(PageOutputFormat.class);

        // Files the mapper looks up by name in the DistributedCache.
        final String outputDir = job.get("wm.outputDir");
        final Path siteInfoFile = new Path(outputDir + "/final/siteInfo.xml");
        final Path languageFile = new Path(job.get("wm.langFile"));
        DistributedCache.addCacheFile(siteInfoFile.toUri(), job);
        DistributedCache.addCacheFile(languageFile.toUri(), job);

        // Input and output locations.
        FileInputFormat.setInputPaths(job, job.get("wm.inputDir"));
        final String stepDirName = DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.page);
        FileOutputFormat.setOutputPath(job, new Path(outputDir + "/" + stepDirName));

        // Side outputs, written as comma-separated text.
        MultipleOutputs.addNamedOutput(job, Output.tempRedirect.name(), TextOutputFormat.class, IntWritable.class, Text.class);
        MultipleOutputs.addNamedOutput(job, Output.tempRootCategory.name(), TextOutputFormat.class, IntWritable.class, Text.class);
        MultipleOutputs.addNamedOutput(job, Output.tempEditDates.name(), TextOutputFormat.class, IntWritable.class, LongWritable.class);
        job.set("mapred.textoutputformat.separator", ",");

        this.counters = JobClient.runJob(job).getCounters();
        return 0;
    }

    /**
     * Copies this step's counter values into the given stats map, keyed by
     * counter name. {@code Counter.rootCategoryCount} is used only as a sanity
     * check and is not copied.
     *
     * @param treeMap map to add the counter values to; modified in place
     * @return the same map instance that was passed in
     * @throws Exception if the root category counter is not exactly 1
     */
    public TreeMap<String, Long> updateStats(TreeMap<String, Long> treeMap) throws Exception {
        final long rootCategoriesSeen = this.counters.getCounter(Counter.rootCategoryCount);
        if (rootCategoriesSeen != 1) {
            throw new Exception("Could not identify root category");
        }
        for (Counter stat : Counter.values()) {
            if (stat == Counter.rootCategoryCount) {
                // Validation-only counter; not part of the reported statistics.
                continue;
            }
            final long value = this.counters.getCounter(stat);
            treeMap.put(stat.name(), Long.valueOf(value));
        }
        return treeMap;
    }
}
