package org.wikipedia.miner.extraction;

import gnu.trove.THashSet;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.record.CsvRecordInput;
import org.apache.hadoop.util.Tool;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extraction.DumpExtractor;
import org.wikipedia.miner.extraction.LabelSensesStep;
import org.wikipedia.miner.extraction.struct.ExLabel;
import org.wikipedia.miner.util.MarkupStripper;

/* loaded from: input_file:org/wikipedia/miner/extraction/LabelOccurrencesStep.class */
public class LabelOccurrencesStep extends Configured implements Tool {

    /**
     * Mapper that emits, for every page it manages to parse, one {@link ExLabel}
     * per vocabulary label found in the page's plain text.
     *
     * <p>{@link #configure(JobConf)} locates the site info file, the language
     * configuration file and the temporary label files in the DistributedCache.
     * The label vocabulary itself is loaded lazily on the first call to
     * {@code map}, because loading needs a {@link Reporter} to signal progress.
     *
     * <p>The compiler generates the erased bridge for {@code map} automatically;
     * declaring it by hand (as decompilers do) is a name clash in source code, so
     * no explicit bridge method is declared here.
     */
    private static class LabelOccurrencesMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, ExLabel> {

        private static final Logger LOGGER = Logger.getLogger(LabelOccurrencesMapper.class);

        /** Characters treated as boundaries between tokens; compiled once instead of per record. */
        private static final Pattern BOUNDARY_PATTERN = Pattern.compile("[\\s\\{\\}\\(\\)\"'\\.\\,\\;\\:\\-\\_]");

        /** Maximum number of boundary-delimited tokens a candidate label may span. */
        private static final int MAX_LABEL_LENGTH = 15;

        private LanguageConfiguration lc;
        private SiteInfo si;
        private DumpPageParser pageParser;
        private final List<Path> labelFiles = new ArrayList<>();
        /** Null until the first call to {@code map} has loaded every label file successfully. */
        private THashSet<String> labelVocabulary = null;
        private final MarkupStripper stripper = new MarkupStripper();

        private LabelOccurrencesMapper() {
        }

        /**
         * Resolves the cached files this mapper depends on and builds the page parser.
         *
         * <p>Any failure is logged and terminates the JVM with exit status 1.
         *
         * @param jobConf job configuration; {@code wm.langFile} and {@code wm.langCode} are read from it
         */
        @Override
        public void configure(JobConf jobConf) {
            try {
                Path[] cachedFiles = DistributedCache.getLocalCacheFiles(jobConf);
                if (cachedFiles == null) {
                    // Fail with a readable message instead of a bare NullPointerException.
                    throw new IllegalStateException("No files available in DistributedCache");
                }
                for (Path cached : cachedFiles) {
                    String cachedName = cached.getName();
                    // The three checks are deliberately independent: only file names are compared.
                    if (cachedName.equals(new Path("final/siteInfo.xml").getName())) {
                        this.si = new SiteInfo(cached);
                    }
                    if (cachedName.equals(new Path(jobConf.get("wm.langFile")).getName())) {
                        this.lc = new LanguageConfiguration(jobConf.get("wm.langCode"), cached);
                    }
                    if (cachedName.startsWith(LabelSensesStep.Output.tempLabel.name())) {
                        LOGGER.info("Located cached label file " + cached.toString());
                        this.labelFiles.add(cached);
                    }
                }
                if (this.si == null) {
                    throw new IllegalStateException("Could not locate 'final/siteInfo.xml' in DistributedCache");
                }
                if (this.lc == null) {
                    throw new IllegalStateException("Could not locate '" + jobConf.get("wm.langFile") + "' in DistributedCache");
                }
                if (this.labelFiles.isEmpty()) {
                    throw new IllegalStateException("Could not gather label files produced in step 3");
                }
                this.pageParser = new DumpPageParser(this.lc, this.si);
            } catch (Exception e) {
                LOGGER.error("Could not configure mapper", e);
                // NOTE(review): exiting the JVM is the pre-existing failure mode; kept unchanged.
                System.exit(1);
            }
        }

        /**
         * Parses one page and emits an {@link ExLabel} for each distinct vocabulary
         * label that occurs in its plain text.
         *
         * <p>Exceptions raised while processing a page are logged and swallowed so
         * that a single bad page does not fail the task.
         *
         * @param key      input key (unused)
         * @param value    raw text of one page element
         * @param output   collector receiving (label text, counts) pairs
         * @param reporter used to report progress while the vocabulary is loaded
         * @throws IOException declared by the {@link Mapper} contract
         */
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, ExLabel> output, Reporter reporter) throws IOException {
            try {
                ensureVocabularyLoaded(reporter);
                DumpPage page = this.pageParser.parsePage(value.toString());
                if (page == null) {
                    return;
                }
                Map<String, ExLabel> occurrences = countLabelOccurrences(page);
                for (Map.Entry<String, ExLabel> entry : occurrences.entrySet()) {
                    output.collect(new Text(entry.getKey()), entry.getValue());
                }
            } catch (Exception e) {
                LOGGER.error("Caught exception", e);
            }
        }

        /**
         * Loads the vocabulary from all label files on first use.
         *
         * <p>The field is assigned only after every file has been read, so a
         * failure part-way through never leaves a partial vocabulary in use.
         */
        private void ensureVocabularyLoaded(Reporter reporter) throws IOException {
            if (this.labelVocabulary != null) {
                return;
            }
            THashSet<String> vocabulary = new THashSet<>();
            for (Path labelFile : this.labelFiles) {
                vocabulary = gatherLabelVocabulary(labelFile, vocabulary, reporter);
            }
            this.labelVocabulary = vocabulary;
        }

        /**
         * Scans the page's plain text for every boundary-delimited span of up to
         * {@link #MAX_LABEL_LENGTH} tokens and counts those found in the vocabulary.
         *
         * @return map from label text to its counts for this page, one entry per distinct label
         */
        private Map<String, ExLabel> countLabelOccurrences(DumpPage page) {
            Map<String, ExLabel> occurrences = new HashMap<>();
            // Sentinels guarantee a boundary character before the first and after the last token.
            String text = "$ " + this.stripper.stripToPlainText(page.getMarkup(), null) + " $";

            List<Integer> boundaries = new ArrayList<>();
            Matcher matcher = BOUNDARY_PATTERN.matcher(text);
            while (matcher.find()) {
                boundaries.add(matcher.start());
            }

            for (int i = 0; i < boundaries.size(); i++) {
                int start = boundaries.get(i) + 1;
                if (Character.isWhitespace(text.charAt(start))) {
                    continue;
                }
                // Try the longest candidate first, then shrink it one boundary at a time.
                int furthest = Math.min(i + MAX_LABEL_LENGTH, boundaries.size() - 1);
                for (int j = furthest; j > i; j--) {
                    String candidate = text.substring(start, boundaries.get(j));
                    // A single character right after an apostrophe (e.g. the "s" of "'s") is not a label.
                    boolean loneCharAfterApostrophe = candidate.length() == 1 && text.charAt(start - 1) == '\'';
                    if (loneCharAfterApostrophe || candidate.trim().isEmpty() || !this.labelVocabulary.contains(candidate)) {
                        continue;
                    }
                    ExLabel label = occurrences.get(candidate);
                    if (label == null) {
                        // First sighting on this page; presumably args 3 and 4 are the text
                        // document and occurrence counts - confirm against ExLabel.
                        occurrences.put(candidate, new ExLabel(0L, 0L, 1L, 1L, new TreeMap()));
                    } else {
                        label.setTextOccCount(label.getTextOccCount() + 1);
                    }
                }
            }
            return occurrences;
        }

        /**
         * Adds the {@code labelText} field of every record in a label file to the vocabulary.
         *
         * <p>The file is decoded as UTF-8, matching the encoding used to hand each
         * line to {@link CsvRecordInput}, and the reader is always closed. Lines
         * that cannot be parsed are logged and skipped.
         *
         * @param path     local path of the cached label file
         * @param tHashSet vocabulary to add to
         * @param reporter notified after each label so the task is not considered hung
         * @return the same set that was passed in
         * @throws IOException if the file cannot be opened or read
         */
        private THashSet<String> gatherLabelVocabulary(Path path, THashSet<String> tHashSet, Reporter reporter) throws IOException {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(path.toString()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    try {
                        CsvRecordInput record = new CsvRecordInput(new ByteArrayInputStream(line.getBytes(StandardCharsets.UTF_8)));
                        tHashSet.add(record.readString("labelText"));
                        reporter.progress();
                    } catch (Exception e) {
                        LOGGER.error("Caught exception while gathering label from '" + line + "' in '" + path + "'", e);
                    }
                }
            }
            return tHashSet;
        }
    }

    /* loaded from: input_file:org/wikipedia/miner/extraction/LabelOccurrencesStep$LabelOccurrencesReducer.class */
    public static class LabelOccurrencesReducer extends MapReduceBase implements Reducer<Text, ExLabel, Text, ExLabel> {
        public void reduce(Text text, Iterator<ExLabel> it, OutputCollector<Text, ExLabel> outputCollector, Reporter reporter) throws IOException {
            ExLabel exLabel = new ExLabel(0L, 0L, 0L, 0L, new TreeMap());
            while (it.hasNext()) {
                ExLabel next = it.next();
                exLabel.setTextDocCount(exLabel.getTextDocCount() + next.getTextDocCount());
                exLabel.setTextOccCount(exLabel.getTextOccCount() + next.getTextOccCount());
            }
            outputCollector.collect(text, exLabel);
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((Text) obj, (Iterator<ExLabel>) it, (OutputCollector<Text, ExLabel>) outputCollector, reporter);
        }
    }

    /**
     * Configures and runs the "count label occurrences" job.
     *
     * <p>Pages are read from {@code wm.inputDir} as {@code <page>…</page>}
     * fragments and the result is written below {@code wm.outputDir}. The site
     * info file, the language file and every temporary label file found in the
     * label-sense step's output directory are added to the DistributedCache.
     *
     * @param strArr command-line arguments, forwarded to {@link DumpExtractor#configureJob}
     * @return always 0; this method only returns if {@code JobClient.runJob} did not throw
     * @throws Exception if configuring or running the job fails
     */
    @Override
    public int run(String[] strArr) throws Exception {
        JobConf conf = new JobConf(LabelOccurrencesStep.class);
        DumpExtractor.configureJob(conf, strArr);

        // Job identity, key/value types and processing classes.
        conf.setJobName("WM: count label occurrences");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(ExLabel.class);
        conf.setMapperClass(LabelOccurrencesMapper.class);
        conf.setCombinerClass(LabelOccurrencesReducer.class);
        conf.setReducerClass(LabelOccurrencesReducer.class);

        // Input: one record per <page> element.
        conf.setInputFormat(XmlInputFormat.class);
        conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
        conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
        String inputDir = conf.get("wm.inputDir");
        FileInputFormat.setInputPaths(conf, inputDir);

        // Output: this step's own directory below wm.outputDir.
        conf.setOutputFormat(LabelSensesStep.LabelOutputFormat.class);
        String occurrenceDirName = DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.labelOccurrence);
        Path outputPath = new Path(conf.get("wm.outputDir") + "/" + occurrenceDirName);
        FileOutputFormat.setOutputPath(conf, outputPath);

        // Side files needed by every mapper.
        Path siteInfoPath = new Path(conf.get("wm.outputDir") + "/final/siteInfo.xml");
        DistributedCache.addCacheFile(siteInfoPath.toUri(), conf);
        Path langFilePath = new Path(conf.get("wm.langFile"));
        DistributedCache.addCacheFile(langFilePath.toUri(), conf);

        // Cache the temporary label files written by the label-sense step.
        String senseDirName = DumpExtractor.getDirectoryName(DumpExtractor.ExtractionStep.labelSense);
        Path labelSenseDir = new Path(conf.get("wm.outputDir") + "/" + senseDirName);
        FileStatus[] candidates = FileSystem.get(conf).listStatus(labelSenseDir);
        for (FileStatus candidate : candidates) {
            Path candidatePath = candidate.getPath();
            if (!candidatePath.getName().startsWith(LabelSensesStep.Output.tempLabel.name())) {
                continue;
            }
            Logger.getLogger(LabelOccurrencesStep.class).info("Cached temporary label file " + candidate.getPath());
            DistributedCache.addCacheFile(candidate.getPath().toUri(), conf);
        }

        JobClient.runJob(conf);
        return 0;
    }
}
