package org.wikipedia.miner.extraction;

import gnu.trove.TObjectIntHashMap;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Vector;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.record.CsvRecordOutput;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.log4j.Logger;
import org.wikipedia.miner.db.struct.DbIntList;
import org.wikipedia.miner.extraction.DumpExtractor;
import org.wikipedia.miner.extraction.PageStep;
import org.wikipedia.miner.model.Page;

/* loaded from: input_file:org/wikipedia/miner/extraction/RedirectStep.class */
public class RedirectStep extends Configured implements Tool {

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/wikipedia/miner/extraction/RedirectStep$Output.class */
    /**
     * Names of the two data sets written by this step. The constant names are used
     * verbatim as output names, so renaming a constant changes the files produced.
     */
    public enum Output {
        /**
         * Name of the {@code MultipleOutputs} named output to which the mapper writes
         * one (redirect id, target id) pair per resolved redirect.
         */
        redirectTargetsBySource,
        /**
         * Replaces the text {@code "part"} in the file names created by
         * {@code RedirectOutputFormat}.
         */
        redirectSourcesByTarget
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/RedirectStep$RedirectOutputFormat.class */
    /**
     * Output format used for the job's main output: every record becomes one line holding
     * the key followed by the value's ids, written through {@link CsvRecordOutput}.
     */
    private static class RedirectOutputFormat extends FileOutputFormat<IntWritable, DbIntList> {

        /**
         * Writes (key, id list) records to a single stream, one record per line.
         */
        protected static class RedirectRecordWriter implements RecordWriter<IntWritable, DbIntList> {
            protected DataOutputStream outStream;

            /**
             * @param dataOutputStream stream that receives all records; closed by {@link #close(Reporter)}
             */
            public RedirectRecordWriter(DataOutputStream dataOutputStream) {
                outStream = dataOutputStream;
            }

            /**
             * Writes the key as the "target" field, then the ids of the value in ascending
             * order as the "sources" vector, and terminates the record with a line feed.
             * The list held by {@code dbIntList} is sorted in place.
             */
            public synchronized void write(IntWritable intWritable, DbIntList dbIntList) throws IOException {
                ArrayList<Integer> sourceIds = dbIntList.getValues();
                Collections.sort(sourceIds);

                // A fresh CsvRecordOutput is created for each record.
                CsvRecordOutput csv = new CsvRecordOutput(outStream);
                csv.writeInt(intWritable.get(), "target");
                csv.startVector(sourceIds, "sources");
                for (Integer sourceId : sourceIds) {
                    csv.writeInt(sourceId.intValue(), "link");
                }
                csv.endVector(sourceIds, "sources");

                outStream.write('\n');
            }

            /** Closes the underlying stream; the reporter is not used. */
            public synchronized void close(Reporter reporter) throws IOException {
                outStream.close();
            }
        }

        private RedirectOutputFormat() {
        }

        /**
         * Creates the writer for one task. The output file name is {@code str} with
         * {@code "part"} replaced by {@code Output.redirectSourcesByTarget.name()}, and the
         * file system is taken from the resulting path rather than from {@code fileSystem}.
         */
        public RecordWriter<IntWritable, DbIntList> getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
            String fileName = str.replace("part", Output.redirectSourcesByTarget.name());
            Path outputFile = FileOutputFormat.getTaskOutputPath(jobConf, fileName);
            FileSystem outputFs = outputFile.getFileSystem(jobConf);
            return new RedirectRecordWriter(outputFs.create(outputFile, progressable));
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/RedirectStep$Step2Mapper.class */
    private static class Step2Mapper extends MapReduceBase implements Mapper<LongWritable, Text, IntWritable, DbIntList> {
        private LanguageConfiguration lc;
        private SiteInfo si;
        private MultipleOutputs mos;
        Vector<Path> pageFiles = new Vector<>();
        private TObjectIntHashMap<String> articlesByTitle = null;

        private Step2Mapper() {
        }

        public void configure(JobConf jobConf) {
            HashSet hashSet = new HashSet();
            hashSet.add(Page.PageType.article);
            hashSet.add(Page.PageType.redirect);
            hashSet.add(Page.PageType.disambiguation);
            try {
                for (Path path : DistributedCache.getLocalCacheFiles(jobConf)) {
                    if (path.getName().equals(new Path("final/siteInfo.xml").getName())) {
                        this.si = new SiteInfo(path);
                    }
                    if (path.getName().equals(new Path(jobConf.get("wm.langFile")).getName())) {
                        this.lc = new LanguageConfiguration(jobConf.get("wm.langCode"), path);
                    }
                    if (path.getName().startsWith(PageStep.Output.tempPage.name())) {
                        Logger.getLogger(Step2Mapper.class).info("Located cached page file " + path.toString());
                        this.pageFiles.add(path);
                    }
                }
                if (this.si == null) {
                    throw new Exception("Could not locate 'final/siteInfo.xml' in DistributedCache");
                }
                if (this.lc == null) {
                    throw new Exception("Could not locate '" + jobConf.get("wm.langFile") + "' in DistributedCache");
                }
                if (this.pageFiles.isEmpty()) {
                    throw new Exception("Could not gather page summary files produced in step 1");
                }
                this.mos = new MultipleOutputs(jobConf);
            } catch (Exception e) {
                Logger.getLogger(Step2Mapper.class).error("Could not configure mapper", e);
                System.exit(1);
            }
        }

        public void map(LongWritable longWritable, Text text, OutputCollector<IntWritable, DbIntList> outputCollector, Reporter reporter) throws IOException {
            try {
                if (this.articlesByTitle == null) {
                    HashSet hashSet = new HashSet();
                    hashSet.add(Page.PageType.article);
                    hashSet.add(Page.PageType.redirect);
                    hashSet.add(Page.PageType.disambiguation);
                    this.articlesByTitle = new TObjectIntHashMap<>();
                    Iterator<Path> it = this.pageFiles.iterator();
                    while (it.hasNext()) {
                        this.articlesByTitle = Util.gatherPageIdsByTitle(it.next(), hashSet, this.articlesByTitle, reporter);
                    }
                }
                String text2 = text.toString();
                int indexOf = text2.indexOf(44);
                int parseInt = Integer.parseInt(text2.substring(0, indexOf));
                String substring = text2.substring(indexOf + 1);
                Integer targetId = Util.getTargetId(substring, this.articlesByTitle, null);
                if (targetId == null) {
                    Logger.getLogger(Step2Mapper.class).warn("Could not identify id for redirect target '" + substring + "'");
                } else {
                    this.mos.getCollector(Output.redirectTargetsBySource.name(), reporter).collect(new IntWritable(parseInt), new IntWritable(targetId.intValue()));
                    ArrayList arrayList = new ArrayList();
                    arrayList.add(Integer.valueOf(parseInt));
                    outputCollector.collect(new IntWritable(targetId.intValue()), new DbIntList(arrayList));
                }
            } catch (Exception e) {
                Logger.getLogger(Step2Mapper.class).error("Caught exception while processing redirect '" + text + "'", e);
            }
        }

        public void close() throws IOException {
            super.close();
            this.mos.close();
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((LongWritable) obj, (Text) obj2, (OutputCollector<IntWritable, DbIntList>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/RedirectStep$Step2Reducer.class */
    public static class Step2Reducer extends MapReduceBase implements Reducer<IntWritable, DbIntList, IntWritable, DbIntList> {
        public void reduce(IntWritable intWritable, Iterator<DbIntList> it, OutputCollector<IntWritable, DbIntList> outputCollector, Reporter reporter) throws IOException {
            ArrayList arrayList = new ArrayList();
            while (it.hasNext()) {
                Iterator<Integer> it2 = it.next().getValues().iterator();
                while (it2.hasNext()) {
                    arrayList.add(it2.next());
                }
            }
            outputCollector.collect(intWritable, new DbIntList(arrayList));
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((IntWritable) obj, (Iterator<DbIntList>) it, (OutputCollector<IntWritable, DbIntList>) outputCollector, reporter);
        }
    }

    /**
     * Configures and runs the "WM: resolve redirects" job.
     *
     * <p>Input is every file under the page step's directory whose name starts with
     * {@code PageStep.Output.tempRedirect}; output goes to the redirect step's directory.
     * The site info file, the language file and every page-step file whose name starts with
     * {@code PageStep.Output.tempPage} are added to the DistributedCache for the mappers.
     *
     * @param strArr arguments handed to {@code DumpExtractor.configureJob}
     * @return always 0; a failure surfaces as an exception instead
     * @throws Exception if configuring or running the job fails
     */
    public int run(String[] strArr) throws Exception {
        final JobConf jobConf = new JobConf(RedirectStep.class);
        DumpExtractor.configureJob(jobConf, strArr);

        // Job identity and processing classes.
        jobConf.setJobName("WM: resolve redirects");
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(DbIntList.class);
        jobConf.setMapperClass(Step2Mapper.class);
        jobConf.setCombinerClass(Step2Reducer.class);
        jobConf.setReducerClass(Step2Reducer.class);
        jobConf.setInputFormat(TextInputFormat.class);

        // Input: the temporary redirect files written by the page step.
        final String outputDir = jobConf.get("wm.outputDir");
        final String pageStepDir = outputDir + "/" + DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.page);
        FileInputFormat.setInputPaths(jobConf, new Path(pageStepDir + "/" + PageStep.Output.tempRedirect + "*"));

        // Output: this step's own directory, written through the custom format.
        jobConf.setOutputFormat(RedirectOutputFormat.class);
        final String redirectStepDir = outputDir + "/" + DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.redirect);
        FileOutputFormat.setOutputPath(jobConf, new Path(redirectStepDir));

        // Side files the mappers look up in the DistributedCache.
        DistributedCache.addCacheFile(new Path(outputDir + "/final/siteInfo.xml").toUri(), jobConf);
        DistributedCache.addCacheFile(new Path(jobConf.get("wm.langFile")).toUri(), jobConf);

        final String pageFilePrefix = PageStep.Output.tempPage.name();
        final FileStatus[] pageStepFiles = FileSystem.get(jobConf).listStatus(new Path(pageStepDir));
        for (FileStatus candidate : pageStepFiles) {
            if (!candidate.getPath().getName().startsWith(pageFilePrefix)) {
                continue;
            }
            Logger.getLogger(RedirectStep.class).info("Cached page file " + candidate.getPath());
            DistributedCache.addCacheFile(candidate.getPath().toUri(), jobConf);
        }

        // Secondary output: comma-separated (redirect id, target id) text lines.
        MultipleOutputs.addNamedOutput(jobConf, Output.redirectTargetsBySource.name(), TextOutputFormat.class, IntWritable.class, IntWritable.class);
        jobConf.set("mapred.textoutputformat.separator", ",");

        JobClient.runJob(jobConf);
        return 0;
    }
}
