package org.wikipedia.miner.extraction;

import gnu.trove.TIntIntHashMap;
import gnu.trove.TObjectIntHashMap;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.Span;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.record.CsvRecordOutput;
import org.apache.hadoop.record.Record;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.log4j.Logger;
import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument;
import org.wikipedia.miner.db.struct.DbLinkLocation;
import org.wikipedia.miner.db.struct.DbSentenceSplitList;
import org.wikipedia.miner.db.struct.DbTranslations;
import org.wikipedia.miner.extraction.DumpExtractor;
import org.wikipedia.miner.extraction.PageStep;
import org.wikipedia.miner.extraction.RedirectStep;
import org.wikipedia.miner.extraction.struct.ExLabel;
import org.wikipedia.miner.extraction.struct.ExSenseForLabel;
import org.wikipedia.miner.model.Page;
import org.wikipedia.miner.util.MarkupStripper;

/* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep.class */
public class LabelSensesStep extends Configured implements Tool {

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: org.wikipedia.miner.extraction.LabelSensesStep$1, reason: invalid class name */
    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$1.class */
    /**
     * Lookup table that assigns a small integer label to the page types handled by this step:
     * article = 1, disambiguation = 2, category = 3, redirect = 4. The table is indexed by the
     * constant's {@code ordinal()}; slots for any other page type keep the array default of 0.
     */
    public static /* synthetic */ class AnonymousClass1 {
        // One slot per Page.PageType constant; populated once by the static initializer below.
        static final /* synthetic */ int[] $SwitchMap$org$wikipedia$miner$model$Page$PageType = new int[Page.PageType.values().length];

        static {
            // Every assignment has its own guard: a constant that is missing at link time
            // (NoSuchFieldError) leaves only its own slot unset and is otherwise ignored.
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.article.ordinal()] = 1;
            } catch (NoSuchFieldError e) {
            }
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.disambiguation.ordinal()] = 2;
            } catch (NoSuchFieldError e2) {
            }
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.category.ordinal()] = 3;
            } catch (NoSuchFieldError e3) {
            }
            try {
                $SwitchMap$org$wikipedia$miner$model$Page$PageType[Page.PageType.redirect.ordinal()] = 4;
            } catch (NoSuchFieldError e4) {
            }
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$IntRecordOutputFormat.class */
    protected static class IntRecordOutputFormat extends TextOutputFormat<IntWritable, Record> {

        /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$IntRecordOutputFormat$IntRecordWriter.class */
        protected static class IntRecordWriter implements RecordWriter<IntWritable, Record> {
            protected DataOutputStream outStream;

            public IntRecordWriter(DataOutputStream dataOutputStream) {
                this.outStream = dataOutputStream;
            }

            public synchronized void write(IntWritable intWritable, Record record) throws IOException {
                CsvRecordOutput csvRecordOutput = new CsvRecordOutput(this.outStream);
                csvRecordOutput.writeInt(intWritable.get(), (String) null);
                record.serialize(csvRecordOutput);
            }

            public synchronized void close(Reporter reporter) throws IOException {
                this.outStream.close();
            }
        }

        protected IntRecordOutputFormat() {
        }

        public RecordWriter<IntWritable, Record> getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
            Path taskOutputPath = FileOutputFormat.getTaskOutputPath(jobConf, str);
            return new IntRecordWriter(taskOutputPath.getFileSystem(jobConf).create(taskOutputPath, progressable));
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$LabelOutputFormat.class */
    protected static class LabelOutputFormat extends TextOutputFormat<Text, ExLabel> {

        /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$LabelOutputFormat$LabelRecordWriter.class */
        protected static class LabelRecordWriter implements RecordWriter<Text, ExLabel> {
            protected DataOutputStream outStream;

            public LabelRecordWriter(DataOutputStream dataOutputStream) {
                this.outStream = dataOutputStream;
            }

            public synchronized void write(Text text, ExLabel exLabel) throws IOException {
                CsvRecordOutput csvRecordOutput = new CsvRecordOutput(this.outStream);
                csvRecordOutput.writeString(text.toString(), "label");
                exLabel.serialize(csvRecordOutput);
            }

            public synchronized void close(Reporter reporter) throws IOException {
                this.outStream.close();
            }
        }

        protected LabelOutputFormat() {
        }

        public RecordWriter<Text, ExLabel> getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
            Path taskOutputPath = FileOutputFormat.getTaskOutputPath(jobConf, str.replace("part", Output.tempLabel.name()));
            return new LabelRecordWriter(taskOutputPath.getFileSystem(jobConf).create(taskOutputPath, progressable));
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$LabelSensesMapper.class */
    private static class LabelSensesMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, ExLabel> {
        // Language configuration, built in configure() from the cached "wm.langFile" file.
        private LanguageConfiguration lc;
        // Site information, built in configure() from the cached siteInfo.xml file.
        private SiteInfo si;
        // Parsers for page and link markup; both are created in configure() from lc and si.
        private DumpPageParser pageParser;
        private DumpLinkParser linkParser;
        // Sentence detector built in configure() from the cached "wm.sentenceModel" file.
        private SentenceDetectorME sentenceDetector;
        // Collectors for the named side outputs listed in the Output enum.
        private MultipleOutputs mos;
        // Cached page summary files (names starting with PageStep.Output.tempPage).
        Vector<Path> pageFiles = new Vector<>();
        // Title -> page id lookups; left null here and filled lazily on the first call to map().
        private TObjectIntHashMap<String> articlesByTitle = null;
        private TObjectIntHashMap<String> categoriesByTitle = null;
        // Cached redirect summary files (names starting with RedirectStep.Output.redirectTargetsBySource).
        Vector<Path> redirectFiles = new Vector<>();
        // Redirect source id -> target id lookup; left null here and filled lazily on the first call to map().
        private TIntIntHashMap redirectTargetsBySource = null;
        private MarkupStripper stripper = new MarkupStripper();
        // Matches a newline, optional whitespace, and then another newline or one of ':', '*', '#'.
        Pattern paragraphSplitPattern = Pattern.compile("\n(\\s*)[\n\\:\\*\\#]");

        // Private no-argument constructor; performs no initialisation beyond the field initialisers above.
        private LabelSensesMapper() {
        }

        /**
         * Prepares the mapper from the files in the DistributedCache: the sentence model, the
         * site info file, the language configuration, and the page and redirect summary files
         * written by earlier steps. Once everything is located, the page/link parsers and the
         * named-output collectors are created.
         *
         * <p>Any failure (missing file, unreadable sentence model, missing configuration key)
         * is logged and terminates the JVM with exit status 1.
         *
         * @param jobConf job configuration; reads "wm.sentenceModel", "wm.langFile" and "wm.langCode"
         */
        public void configure(JobConf jobConf) {
            final Logger log = Logger.getLogger(LabelSensesMapper.class);
            try {
                // Cached files are matched by file name only, so resolve the expected names once.
                final String sentenceModelName = new Path(jobConf.get("wm.sentenceModel")).getName();
                final String langFileName = new Path(jobConf.get("wm.langFile")).getName();
                final String siteInfoName = new Path("final/siteInfo.xml").getName();
                final String pageFilePrefix = PageStep.Output.tempPage.name();
                final String redirectFilePrefix = RedirectStep.Output.redirectTargetsBySource.name();

                Path[] cacheFiles = DistributedCache.getLocalCacheFiles(jobConf);
                if (cacheFiles == null) {
                    // Treat "no cache at all" like an empty cache so the checks below report what is missing.
                    cacheFiles = new Path[0];
                }

                for (Path path : cacheFiles) {
                    final String name = path.getName();
                    log.info("Located cached file " + path.toString());
                    log.info(name + " v.s " + sentenceModelName);
                    if (name.equals(sentenceModelName)) {
                        log.info("Located cached sentence model " + path.toString());
                        this.sentenceDetector = loadSentenceDetector(path);
                    }
                    if (name.equals(siteInfoName)) {
                        this.si = new SiteInfo(path);
                    }
                    if (name.equals(langFileName)) {
                        this.lc = new LanguageConfiguration(jobConf.get("wm.langCode"), path);
                    }
                    if (name.startsWith(pageFilePrefix)) {
                        log.info("Located cached page file " + path.toString());
                        this.pageFiles.add(path);
                    }
                    if (name.startsWith(redirectFilePrefix)) {
                        log.info("Located cached redirect file " + path.toString());
                        this.redirectFiles.add(path);
                    }
                }

                if (this.si == null) {
                    throw new IllegalStateException("Could not locate 'final/siteInfo.xml' in DistributedCache");
                }
                if (this.lc == null) {
                    throw new IllegalStateException("Could not locate '" + jobConf.get("wm.langFile") + "' in DistributedCache");
                }
                if (this.sentenceDetector == null) {
                    throw new IllegalStateException("Could not load sentence model '" + jobConf.get("wm.sentenceModel") + "' from DistributedCache");
                }
                if (this.pageFiles.isEmpty()) {
                    throw new IllegalStateException("Could not gather page summary files produced in step 1");
                }
                if (this.redirectFiles.isEmpty()) {
                    throw new IllegalStateException("Could not gather redirect summary files produced in step 2");
                }

                this.pageParser = new DumpPageParser(this.lc, this.si);
                this.linkParser = new DumpLinkParser(this.lc, this.si);
                this.mos = new MultipleOutputs(jobConf);
            } catch (Exception e) {
                log.error("Could not configure mapper", e);
                System.exit(1);
            }
        }

        /**
         * Reads an OpenNLP sentence model from a local file and wraps it in a detector.
         * The stream is always closed; a read failure is propagated instead of producing a
         * detector around a null model.
         *
         * @param modelFile local path of the serialised sentence model
         * @return a detector backed by the loaded model
         * @throws IOException if the file cannot be opened or is not a valid model
         */
        private SentenceDetectorME loadSentenceDetector(Path modelFile) throws IOException {
            try (FileInputStream modelStream = new FileInputStream(new File(modelFile.toString()))) {
                return new SentenceDetectorME(new SentenceModel(modelStream));
            }
        }

        /**
         * Processes one page of the dump.
         *
         * <ul>
         *   <li>Articles and disambiguation pages contribute their own title as a label, then
         *       have their markup processed like a category page.</li>
         *   <li>Category pages (and the two types above) have their markup scanned for
         *       category links, translations and, sentence by sentence, links to articles.</li>
         *   <li>Redirects contribute their title as a label for the resolved target, if the
         *       target can be resolved.</li>
         * </ul>
         *
         * Labels go to {@code outputCollector}; link locations and translations go to the
         * named outputs. Any exception is logged and recorded in the {@code fatalErrors}
         * output (keyed by page id, or -1 when the page could not be parsed) instead of
         * failing the task.
         *
         * @param longWritable    input key (unused)
         * @param text            XML of a single page element
         * @param outputCollector receives (label text, label statistics) pairs
         * @param reporter        used for progress reporting and named-output collectors
         * @throws IOException if writing the error record itself fails
         */
        public void map(LongWritable longWritable, Text text, OutputCollector<Text, ExLabel> outputCollector, Reporter reporter) throws IOException {
            DumpPage dumpPage = null;
            try {
                loadLookupTables(reporter);

                dumpPage = this.pageParser.parsePage(text.toString());
                if (dumpPage == null) {
                    return;
                }

                final HashMap<String, ExLabel> labels = new HashMap<>();
                final TreeMap<Integer, ArrayList<Integer>> sentenceIndexesByTarget = new TreeMap<>();
                final TreeMap<String, String> translations = new TreeMap<>();

                switch (dumpPage.getType()) {
                    case article:
                    case disambiguation: {
                        // The page title is itself a label for the page.
                        ExLabel titleLabel = new ExLabel(0L, 0L, 0L, 0L, new TreeMap<Integer, ExSenseForLabel>());
                        titleLabel.getSensesById().put(dumpPage.getId(), new ExSenseForLabel(0L, 0L, true, false));
                        labels.put(dumpPage.getTitle(), titleLabel);
                    }
                    // Deliberate fall through: article markup is processed exactly like category markup.
                    case category: {
                        String markup = this.stripper.stripAllButInternalLinksAndEmphasis(dumpPage.getMarkup(), ' ');
                        gatherCategoryLinksAndTranslations(dumpPage, markup, translations, reporter);

                        String articleLinkMarkup = this.stripper.stripNonArticleInternalLinks(markup, ' ');
                        int sentenceIndex = 0;
                        int sentenceStart = 0;
                        for (int sentenceEnd : collectSentenceSplits(dumpPage.getId(), articleLinkMarkup, reporter)) {
                            processSentence(articleLinkMarkup.substring(sentenceStart, sentenceEnd), sentenceIndex, dumpPage, labels, sentenceIndexesByTarget, reporter);
                            sentenceStart = sentenceEnd;
                            sentenceIndex++;
                            reporter.progress();
                        }
                        // Whatever follows the last split is the final sentence.
                        processSentence(articleLinkMarkup.substring(sentenceStart), sentenceIndex, dumpPage, labels, sentenceIndexesByTarget, reporter);
                        break;
                    }
                    case redirect: {
                        Integer targetId = Util.getTargetId(dumpPage.getTarget(), this.articlesByTitle, this.redirectTargetsBySource);
                        if (targetId != null) {
                            ExLabel redirectLabel = new ExLabel(0L, 0L, 0L, 0L, new TreeMap<Integer, ExSenseForLabel>());
                            redirectLabel.getSensesById().put(targetId, new ExSenseForLabel(0L, 0L, false, true));
                            labels.put(dumpPage.getTitle(), redirectLabel);
                        }
                        break;
                    }
                }

                for (Map.Entry<String, ExLabel> label : labels.entrySet()) {
                    outputCollector.collect(new Text(label.getKey()), label.getValue());
                }
                for (Map.Entry<Integer, ArrayList<Integer>> location : sentenceIndexesByTarget.entrySet()) {
                    this.mos.getCollector(Output.tempPageLink.name(), reporter).collect(new IntWritable(dumpPage.getId()), new DbLinkLocation(location.getKey().intValue(), location.getValue()));
                }
                if (!translations.isEmpty()) {
                    this.mos.getCollector(Output.translations.name(), reporter).collect(new IntWritable(dumpPage.getId()), new DbTranslations(translations));
                }
            } catch (Exception e) {
                Logger.getLogger(LabelSensesMapper.class).error("Caught exception", e);
                StringWriter trace = new StringWriter();
                e.printStackTrace(new PrintWriter(trace));
                // -1 marks failures that happened before the page could be parsed.
                int pageId = (dumpPage != null) ? dumpPage.getId() : -1;
                this.mos.getCollector(Output.fatalErrors.name(), reporter).collect(new IntWritable(pageId), new Text(trace.toString().replace('\n', ';')));
            }
        }

        /**
         * Loads the title and redirect lookup tables from the cached summary files the first
         * time it is called; later calls do nothing.
         *
         * @param reporter passed through to the loaders
         * @throws Exception if any summary file cannot be read
         */
        private void loadLookupTables(Reporter reporter) throws Exception {
            if (this.articlesByTitle == null || this.categoriesByTitle == null) {
                HashSet<Page.PageType> articleTypes = new HashSet<>();
                articleTypes.add(Page.PageType.article);
                articleTypes.add(Page.PageType.redirect);
                articleTypes.add(Page.PageType.disambiguation);
                HashSet<Page.PageType> categoryTypes = new HashSet<>();
                categoryTypes.add(Page.PageType.category);

                this.articlesByTitle = new TObjectIntHashMap<>();
                this.categoriesByTitle = new TObjectIntHashMap<>();
                for (Path pageFile : this.pageFiles) {
                    this.articlesByTitle = Util.gatherPageIdsByTitle(pageFile, articleTypes, this.articlesByTitle, reporter);
                    this.categoriesByTitle = Util.gatherPageIdsByTitle(pageFile, categoryTypes, this.categoriesByTitle, reporter);
                }
            }
            if (this.redirectTargetsBySource == null) {
                this.redirectTargetsBySource = new TIntIntHashMap();
                for (Path redirectFile : this.redirectFiles) {
                    this.redirectTargetsBySource = Util.gatherRedirectTargetsBySource(redirectFile, this.redirectTargetsBySource, reporter);
                }
            }
        }

        /**
         * Finds the character offsets at which the given markup should be split into sentences.
         *
         * <p>Link regions ({@code [[...]]}) and parenthesised regions are masked with the
         * character 'a' before detection. Split points are the end offsets reported by the
         * sentence detector plus the start of every paragraph-break match that is preceded by
         * non-blank text since the previous match. A non-empty result is also written to the
         * {@code sentenceSplits} named output, keyed by page id.
         *
         * @param i        id of the page the markup belongs to
         * @param str      markup to split
         * @param reporter used to obtain the named-output collector
         * @return the split offsets in ascending order (possibly empty)
         */
        private TreeSet<Integer> collectSentenceSplits(int i, String str, Reporter reporter) throws IOException {
            final TreeSet<Integer> splits = new TreeSet<>();

            // Mask links first, then parentheses, so neither can contain a split point.
            final String withoutLinks = this.stripper.stripRegions(str, this.stripper.gatherComplexRegions(str, "\\[\\[", "\\]\\]"), 'a');
            final String masked = this.stripper.stripRegions(withoutLinks, this.stripper.gatherComplexRegions(withoutLinks, "\\(", "\\)"), 'a');

            final Span[] sentences = this.sentenceDetector.sentPosDetect(masked);
            for (int s = 0; s < sentences.length; s++) {
                splits.add(sentences[s].getEnd());
            }

            final Matcher paragraphBreaks = this.paragraphSplitPattern.matcher(masked);
            int previousBreak = 0;
            while (paragraphBreaks.find()) {
                final int breakStart = paragraphBreaks.start();
                final String between = masked.substring(previousBreak, breakStart);
                if (!between.trim().isEmpty()) {
                    splits.add(breakStart);
                }
                previousBreak = breakStart;
            }

            if (!splits.isEmpty()) {
                final ArrayList<Integer> ordered = new ArrayList<Integer>(splits);
                this.mos.getCollector(Output.sentenceSplits.name(), reporter).collect(new IntWritable(i), new DbSentenceSplitList(ordered));
            }
            return splits;
        }

        /**
         * Records every resolvable article link found in one sentence.
         *
         * <p>For each {@code [[...]]} region whose target is in namespace 0 and resolves to a
         * page id, the link's anchor text is counted as a label for that target, and the
         * sentence index is appended to the target's list of sentence indexes (once per sentence).
         *
         * <p>Counts are per page: document counts are set to 1, occurrence counts are
         * incremented per link. The label's document count is also set to 1 when the label
         * already existed for this page (for example because it was created from the page
         * title with all counts 0); otherwise such a label would report link occurrences in
         * zero documents.
         *
         * @param str      markup of the sentence
         * @param i        index of the sentence within the page
         * @param dumpPage page being processed (not read here)
         * @param hashMap  labels gathered so far for this page, by anchor text; updated in place
         * @param treeMap  sentence indexes gathered so far for this page, by target id; updated in place
         * @param reporter not read here
         */
        private void processSentence(String str, int i, DumpPage dumpPage, HashMap<String, ExLabel> hashMap, TreeMap<Integer, ArrayList<Integer>> treeMap, Reporter reporter) throws Exception {
            final Logger log = Logger.getLogger(LabelSensesMapper.class);
            for (int[] region : this.stripper.gatherComplexRegions(str, "\\[\\[", "\\]\\]")) {
                // Drop the surrounding "[[" and "]]".
                String linkMarkup = str.substring(region[0] + 2, region[1] - 2);

                DumpLink link = null;
                try {
                    link = this.linkParser.parseLink(linkMarkup);
                } catch (Exception e) {
                    // Malformed links are common in the dump; only the markup is logged, not the stack trace.
                    log.warn("Could not parse link markup '" + linkMarkup + "'");
                }
                if (link == null || link.getTargetNamespace() != 0) {
                    continue;
                }

                Integer targetId = Util.getTargetId(link.getTargetTitle(), this.articlesByTitle, this.redirectTargetsBySource);
                if (targetId == null) {
                    log.warn("Could not resolve page link '" + link.getTargetTitle() + "'");
                    continue;
                }

                String anchor = link.getAnchor();
                ExLabel label = hashMap.get(anchor);
                if (label == null) {
                    label = new ExLabel(1L, 1L, 0L, 0L, new TreeMap<Integer, ExSenseForLabel>());
                    label.getSensesById().put(targetId, new ExSenseForLabel(1L, 1L, false, false));
                    hashMap.put(anchor, label);
                } else {
                    ExSenseForLabel sense = label.getSensesById().get(targetId);
                    if (sense == null) {
                        label.getSensesById().put(targetId, new ExSenseForLabel(1L, 1L, false, false));
                    } else {
                        sense.setLinkDocCount(1L);
                        sense.setLinkOccCount(sense.getLinkOccCount() + 1);
                    }
                    // This page now links with the label, so it counts as one linking document.
                    label.setLinkDocCount(1L);
                    label.setLinkOccCount(label.getLinkOccCount() + 1);
                }

                ArrayList<Integer> sentenceIndexes = treeMap.get(targetId);
                if (sentenceIndexes == null) {
                    sentenceIndexes = new ArrayList<>();
                    treeMap.put(targetId, sentenceIndexes);
                }
                // Sentences arrive in ascending order, so checking the last entry avoids duplicates.
                if (sentenceIndexes.isEmpty() || sentenceIndexes.get(sentenceIndexes.size() - 1).intValue() < i) {
                    sentenceIndexes.add(i);
                }
            }
        }

        /**
         * Scans the markup for {@code [[...]]} links and records two kinds of them.
         *
         * <ul>
         *   <li>Links with a target language are stored in {@code treeMap} as
         *       language -> anchor text.</li>
         *   <li>Links into namespace 14 that resolve to a known category are written to the
         *       {@code tempCategoryParent} output when the page itself is in namespace 14,
         *       and to {@code tempArticleParent} otherwise, as (page id, category id).</li>
         * </ul>
         *
         * Links that cannot be parsed or resolved are logged and skipped.
         *
         * @param dumpPage page being processed
         * @param str      markup of the page
         * @param treeMap  receives the translations found; updated in place
         * @param reporter used to obtain the named-output collectors
         */
        private void gatherCategoryLinksAndTranslations(DumpPage dumpPage, String str, TreeMap<String, String> treeMap, Reporter reporter) throws Exception {
            for (int[] region : this.stripper.gatherComplexRegions(str, "\\[\\[", "\\]\\]")) {
                // Drop the surrounding "[[" and "]]".
                final String linkMarkup = str.substring(region[0] + 2, region[1] - 2);

                DumpLink link = null;
                try {
                    link = this.linkParser.parseLink(linkMarkup);
                } catch (Exception e) {
                    Logger.getLogger(LabelSensesMapper.class).warn("Could not parse link markup '" + linkMarkup + "'");
                }
                if (link == null) {
                    continue;
                }

                if (link.getTargetLanguage() != null) {
                    treeMap.put(link.getTargetLanguage(), link.getAnchor());
                    continue;
                }
                if (link.getTargetNamespace() != 14) {
                    continue;
                }

                final Integer parentId = Util.getTargetId(link.getTargetTitle(), this.categoriesByTitle, null);
                if (parentId == null) {
                    Logger.getLogger(LabelSensesMapper.class).warn("Could not resolve category link '" + link.getTargetTitle() + "'");
                    continue;
                }

                final Output destination = (dumpPage.getNamespace() == 14) ? Output.tempCategoryParent : Output.tempArticleParent;
                this.mos.getCollector(destination.name(), reporter).collect(new IntWritable(dumpPage.getId()), new IntWritable(parentId.intValue()));
            }
        }

        /**
         * Runs the base class clean-up and then closes the named-output collectors.
         *
         * @throws IOException if either close operation fails
         */
        @Override
        public void close() throws IOException {
            super.close();
            mos.close();
        }

        // Untyped overload that casts its arguments and delegates to the typed map(...) above.
        // NOTE(review): marked bridge/synthetic by the decompiler; javac normally emits such a
        // bridge on its own, so this explicit copy may need removing when recompiling — confirm.
        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((LongWritable) obj, (Text) obj2, (OutputCollector<Text, ExLabel>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$LabelSensesReducer.class */
    /**
     * Merges all partial statistics emitted for one label into a single {@link ExLabel}.
     * Registered in {@code run} as both the combiner and the reducer.
     */
    public static class LabelSensesReducer extends MapReduceBase implements Reducer<Text, ExLabel, Text, ExLabel> {

        /**
         * Sums the link document and occurrence counts of every partial label. Senses are
         * merged per id: counts are added, and the fromRedirect / fromTitle flags are set as
         * soon as any partial sense has them set.
         *
         * @param text            label text
         * @param it              partial statistics for that label
         * @param outputCollector receives the single merged (label text, statistics) pair
         * @param reporter        unused
         */
        public void reduce(Text text, Iterator<ExLabel> it, OutputCollector<Text, ExLabel> outputCollector, Reporter reporter) throws IOException {
            final ExLabel merged = new ExLabel(0L, 0L, 0L, 0L, new TreeMap<Integer, ExSenseForLabel>());
            final Map<Integer, ExSenseForLabel> mergedSenses = merged.getSensesById();

            while (it.hasNext()) {
                final ExLabel partial = it.next();

                for (Map.Entry<Integer, ExSenseForLabel> senseEntry : partial.getSensesById().entrySet()) {
                    final Integer senseId = senseEntry.getKey();
                    final ExSenseForLabel incoming = senseEntry.getValue();
                    final ExSenseForLabel known = mergedSenses.get(senseId);

                    if (known == null) {
                        // First time this sense is seen: adopt the incoming object as is.
                        mergedSenses.put(senseId, incoming);
                        continue;
                    }

                    known.setLinkOccCount(known.getLinkOccCount() + incoming.getLinkOccCount());
                    known.setLinkDocCount(known.getLinkDocCount() + incoming.getLinkDocCount());
                    if (incoming.getFromRedirect()) {
                        known.setFromRedirect(true);
                    }
                    if (incoming.getFromTitle()) {
                        known.setFromTitle(true);
                    }
                }

                merged.setLinkDocCount(merged.getLinkDocCount() + partial.getLinkDocCount());
                merged.setLinkOccCount(merged.getLinkOccCount() + partial.getLinkOccCount());
            }

            outputCollector.collect(text, merged);
        }

        // Untyped overload that casts its arguments and delegates to the typed reduce(...) above.
        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((Text) obj, (Iterator<ExLabel>) it, (OutputCollector<Text, ExLabel>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelSensesStep$Output.class */
    /**
     * Names of the outputs written by this step. The constant names are used verbatim as
     * output names, so renaming a constant changes the files that are produced.
     */
    public enum Output {
        // Main job output; LabelOutputFormat substitutes this name for "part" in the file name.
        tempLabel,
        // Page id -> DbLinkLocation (a link target id with the sentence indexes it occurs in).
        tempPageLink,
        // Category page id -> id of a category it links to.
        tempCategoryParent,
        // Non-category page id -> id of a category it links to.
        tempArticleParent,
        // Page id -> DbSentenceSplitList of sentence split offsets.
        sentenceSplits,
        // Page id -> DbTranslations (target language -> anchor text).
        translations,
        // Page id (or -1) -> stack trace of an exception caught while mapping, newlines replaced by ';'.
        fatalErrors
    }

    /**
     * Configures and runs the "gather label senses" job.
     *
     * <p>The job reads page elements from "wm.inputDir" and writes labels below the
     * label-sense directory of "wm.outputDir". It ships the site info, language file,
     * sentence model, and the page and redirect summary files of the earlier steps to the
     * mappers through the DistributedCache, and registers one named output per side product.
     *
     * @param strArr command-line arguments, passed to {@code DumpExtractor.configureJob}
     * @return 0 once the job has completed
     * @throws Exception if configuration fails, a required step directory is missing, or the job fails
     */
    public int run(String[] strArr) throws Exception {
        JobConf jobConf = new JobConf(LabelSensesStep.class);
        DumpExtractor.configureJob(jobConf, strArr);
        jobConf.setJobName("WM: gather label senses");

        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(ExLabel.class);
        jobConf.setMapperClass(LabelSensesMapper.class);
        jobConf.setCombinerClass(LabelSensesReducer.class);
        jobConf.setReducerClass(LabelSensesReducer.class);

        // Each input record is one <page>...</page> element of the dump.
        jobConf.setInputFormat(XmlInputFormat.class);
        jobConf.set(XmlInputFormat.START_TAG_KEY, "<page>");
        jobConf.set(XmlInputFormat.END_TAG_KEY, "</page>");
        FileInputFormat.setInputPaths(jobConf, jobConf.get("wm.inputDir"));

        final String outputDir = jobConf.get("wm.outputDir");
        jobConf.setOutputFormat(LabelOutputFormat.class);
        FileOutputFormat.setOutputPath(jobConf, new Path(outputDir + "/" + DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.labelSense)));

        DistributedCache.addCacheFile(new Path(outputDir + "/final/siteInfo.xml").toUri(), jobConf);
        DistributedCache.addCacheFile(new Path(jobConf.get("wm.langFile")).toUri(), jobConf);
        DistributedCache.addCacheFile(new Path(jobConf.get("wm.sentenceModel")).toUri(), jobConf);

        cacheStepFiles(jobConf, new Path(outputDir + "/" + DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.page)), PageStep.Output.tempPage.name(), "page");
        cacheStepFiles(jobConf, new Path(outputDir + "/" + DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.redirect)), RedirectStep.Output.redirectTargetsBySource.name(), "redirect");

        MultipleOutputs.addNamedOutput(jobConf, Output.tempPageLink.name(), IntRecordOutputFormat.class, IntWritable.class, DbLinkLocation.class);
        MultipleOutputs.addNamedOutput(jobConf, Output.tempCategoryParent.name(), TextOutputFormat.class, IntWritable.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(jobConf, Output.tempArticleParent.name(), TextOutputFormat.class, IntWritable.class, IntWritable.class);
        MultipleOutputs.addNamedOutput(jobConf, Output.sentenceSplits.name(), IntRecordOutputFormat.class, IntWritable.class, DbSentenceSplitList.class);
        MultipleOutputs.addNamedOutput(jobConf, Output.translations.name(), IntRecordOutputFormat.class, IntWritable.class, DbTranslations.class);
        MultipleOutputs.addNamedOutput(jobConf, Output.fatalErrors.name(), TextOutputFormat.class, IntWritable.class, Text.class);
        jobConf.set("mapred.textoutputformat.separator", ",");

        JobClient.runJob(jobConf);
        return 0;
    }

    /**
     * Adds every file in {@code stepDir} whose name starts with {@code prefix} to the
     * DistributedCache of the job.
     *
     * @param jobConf     job being configured
     * @param stepDir     output directory of an earlier extraction step
     * @param prefix      file-name prefix identifying the files to cache
     * @param description short word for the kind of file, used in log and error messages
     * @throws IOException if the directory cannot be listed
     */
    private static void cacheStepFiles(JobConf jobConf, Path stepDir, String prefix, String description) throws IOException {
        FileStatus[] statuses = FileSystem.get(jobConf).listStatus(stepDir);
        if (statuses == null) {
            // Some Hadoop versions return null rather than throwing when the directory does not exist.
            throw new IOException("Could not list " + description + " files: '" + stepDir + "' does not exist");
        }
        for (FileStatus status : statuses) {
            Path file = status.getPath();
            if (file.getName().startsWith(prefix)) {
                Logger.getLogger(LabelSensesStep.class).info("Cached " + description + " file " + file);
                DistributedCache.addCacheFile(file.toUri(), jobConf);
            }
        }
    }
}
