/*
 * Decompiled with CFR 0.152.
 */
package org.tribuo.util.tokens.impl.wordpiece;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;
import org.tribuo.util.tokens.impl.WhitespaceTokenizer;
import org.tribuo.util.tokens.impl.wordpiece.Wordpiece;
import org.tribuo.util.tokens.impl.wordpiece.WordpieceBasicTokenizer;

/**
 * A {@link Tokenizer} that splits text on whitespace, runs a "basic" tokenizer
 * over each whitespace-delimited token, optionally lowercases and strips
 * accents from each resulting piece, and finally applies the
 * {@link Wordpiece} algorithm to it.
 * <p>
 * The tokenizer is stateful: call {@link #reset(CharSequence)} first, then
 * {@link #advance()} until it returns {@code false}, reading the current token
 * via {@link #getToken()} (or the field accessors) after each successful
 * advance. Use {@link #clone()} to obtain an independent instance.
 */
public class WordpieceTokenizer implements Tokenizer {
    /** Matches Unicode non-spacing marks (category Mn), i.e. combining accents after NFD. */
    private static final Pattern accentsPattern = Pattern.compile("\\p{Mn}");

    @Config(mandatory=true, description="an instance of Wordpiece which applies the 'wordpiece' algorithm")
    private Wordpiece wordpiece;
    @Config(description="determines whether or not to lowercase the input text")
    private boolean toLowerCase = true;
    @Config(description="performs whitespace tokenization before 'basic' tokenizer is applied (see basicTokenizer)")
    private Tokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    @Config(description="performs some tokenization work on the input text before the wordpiece algorithm is applied to each resulting token.")
    private Tokenizer basicTokenizer = new WordpieceBasicTokenizer();
    @Config(description="determines whether or not to strip accents/diacritics from the input text")
    private boolean stripAccents = true;
    @Config(description="a set of 'token' strings that should never be split regardless of whether they have e.g., punctuation in the middle.  No entries should have whitespace in them.")
    private Set<String> neverSplitTokens = Collections.emptySet();

    /** True once {@link #reset(CharSequence)} has been called on this instance. */
    private boolean reset;
    /** The whitespace-delimited token currently being expanded into wordpieces. */
    private Token currentToken;
    /** Wordpiece tokens produced from {@link #currentToken}. */
    private List<Token> currentWordpieceTokens = new ArrayList<>();
    /** Index into {@link #currentWordpieceTokens} of the token exposed by {@link #getToken()}. */
    private int currentWordpieceIndex;

    /**
     * No-arg constructor leaving every field at its declared default;
     * presumably used by the OLCUT configuration system to populate the
     * {@code @Config} fields.
     */
    private WordpieceTokenizer() {
    }

    /**
     * Constructs a wordpiece tokenizer. The whitespace tokenizer is always a
     * default {@link WhitespaceTokenizer}.
     *
     * @param wordpiece    applies the wordpiece algorithm to each basic token
     * @param tokenizer    the "basic" tokenizer run on every whitespace-delimited token
     * @param toLowerCase  whether to lowercase text before applying wordpiece
     * @param stripAccents whether to strip accents/diacritics before applying wordpiece
     * @param neverSplit   whitespace-delimited tokens that are emitted unchanged
     */
    public WordpieceTokenizer(Wordpiece wordpiece, Tokenizer tokenizer, boolean toLowerCase, boolean stripAccents, Set<String> neverSplit) {
        this.wordpiece = wordpiece;
        this.basicTokenizer = tokenizer;
        this.toLowerCase = toLowerCase;
        this.stripAccents = stripAccents;
        this.neverSplitTokens = neverSplit;
    }

    /**
     * Returns the provenance describing this tokenizer's configuration.
     *
     * @return a configured-object provenance for this instance
     */
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl((Configurable)this, "Tokenizer");
    }

    /**
     * Points the tokenizer at new input and pre-computes the wordpieces of the
     * first whitespace-delimited token (if any). {@link #advance()} must be
     * called before the first token can be read.
     *
     * @param cs the text to tokenize
     */
    @Override
    public void reset(CharSequence cs) {
        this.reset = true;
        this.whitespaceTokenizer.reset(cs);
        this.currentWordpieceTokens.clear();
        // -1 so that the first advance() lands on index 0.
        this.currentWordpieceIndex = -1;
        if (this.whitespaceTokenizer.advance()) {
            this.currentToken = this.whitespaceTokenizer.getToken();
            this.getWordpieceTokens();
        }
    }

    /**
     * Moves to the next wordpiece token.
     *
     * @return {@code true} if a token is available via {@link #getToken()},
     *         {@code false} once the input is exhausted
     * @throws IllegalStateException if {@link #reset(CharSequence)} has not been called
     */
    @Override
    public boolean advance() {
        if (!this.reset) {
            throw new IllegalStateException("WordpieceTokenizer has not been reset.");
        }
        ++this.currentWordpieceIndex;
        if (this.currentWordpieceIndex < this.currentWordpieceTokens.size()) {
            return true;
        }
        // Current wordpieces are used up: pull whitespace tokens until one
        // yields at least one wordpiece. A loop (rather than recursion) keeps
        // the stack depth constant however many tokens yield nothing.
        while (this.whitespaceTokenizer.advance()) {
            this.currentToken = this.whitespaceTokenizer.getToken();
            this.getWordpieceTokens();
            this.currentWordpieceIndex = 0;
            if (!this.currentWordpieceTokens.isEmpty()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Strips accents by decomposing the text (NFD) and removing every
     * non-spacing mark.
     *
     * @param text the text to normalize
     * @return the text without combining marks
     */
    private static String normalize(String text) {
        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD);
        return accentsPattern.matcher(decomposed).replaceAll("");
    }

    /**
     * Rebuilds {@link #currentWordpieceTokens} from {@link #currentToken}.
     * <p>
     * Tokens listed in {@code neverSplitTokens} are emitted unchanged. Otherwise
     * the token text is run through the basic tokenizer and each basic token is
     * (optionally) lowercased, (optionally) accent-stripped and split into
     * wordpieces. Basic-token offsets are relative to the whitespace token's
     * text, so the whitespace token's start is added to obtain offsets into the
     * original input.
     */
    private void getWordpieceTokens() {
        this.currentWordpieceTokens.clear();
        String tokenText = this.currentToken.text;
        if (this.neverSplitTokens.contains(tokenText)) {
            this.currentWordpieceTokens.add(this.currentToken);
            return;
        }
        int offset = this.currentToken.start;
        for (Token basicToken : this.basicTokenizer.tokenize(tokenText)) {
            String text = basicToken.text;
            if (this.toLowerCase) {
                // Locale.ROOT keeps lowercasing independent of the JVM's default
                // locale (e.g. Turkish dotless-i rules).
                text = text.toLowerCase(Locale.ROOT);
            }
            if (this.stripAccents) {
                text = WordpieceTokenizer.normalize(text);
            }
            List<String> wordpieces = this.wordpiece.wordpiece(text);
            if (wordpieces.isEmpty()) {
                // NOTE(review): this abandons any remaining basic tokens of the
                // current whitespace token; kept as-is, confirm it is intended.
                return;
            }
            if (wordpieces.size() == 1) {
                // A single piece covers the whole basic token; it is either the
                // word itself or the unknown-token marker.
                String wp = wordpieces.get(0);
                Token.TokenType type = wp.equals(this.wordpiece.getUnknownToken())
                        ? Token.TokenType.UNKNOWN
                        : Token.TokenType.WORD;
                this.currentWordpieceTokens.add(
                        new Token(wp, basicToken.start + offset, basicToken.end + offset, type));
                continue;
            }
            // Several pieces: lay them out back to back starting at the basic
            // token's start.
            int begin = offset + basicToken.start;
            for (String wp : wordpieces) {
                Token.TokenType type = Token.TokenType.PREFIX;
                int end = begin + wp.length();
                if (wp.startsWith("##")) {
                    // The two-character "##" marker is not counted in the span.
                    end -= 2;
                    type = Token.TokenType.SUFFIX;
                }
                this.currentWordpieceTokens.add(new Token(wp, begin, end, type));
                begin = end;
            }
        }
    }

    /**
     * Returns the current wordpiece token.
     *
     * @return the token selected by the last successful {@link #advance()}
     * @throws IllegalStateException if there is no current token, e.g. before the
     *         first {@link #advance()} or after the input is exhausted
     */
    @Override
    public Token getToken() {
        // The index is -1 between reset() and the first advance(); guard the
        // lower bound so that case reports "not ready" instead of an
        // IndexOutOfBoundsException.
        if (this.currentWordpieceIndex >= 0
                && this.currentWordpieceIndex < this.currentWordpieceTokens.size()) {
            return this.currentWordpieceTokens.get(this.currentWordpieceIndex);
        }
        throw new IllegalStateException("WordpieceTokenizer is not ready.");
    }

    /**
     * @return the text of the current token
     * @throws IllegalStateException if there is no current token
     */
    @Override
    public String getText() {
        return this.getToken().text;
    }

    /**
     * @return the start offset of the current token
     * @throws IllegalStateException if there is no current token
     */
    @Override
    public int getStart() {
        return this.getToken().start;
    }

    /**
     * @return the end offset of the current token
     * @throws IllegalStateException if there is no current token
     */
    @Override
    public int getEnd() {
        return this.getToken().end;
    }

    /**
     * @return the type of the current token
     * @throws IllegalStateException if there is no current token
     */
    @Override
    public Token.TokenType getType() {
        return this.getToken().type;
    }

    /**
     * Creates an independent, un-reset copy of this tokenizer. The whitespace
     * and basic tokenizers are cloned; the copy starts with no current token.
     *
     * @return a fresh tokenizer with the same configuration
     */
    @Override
    public WordpieceTokenizer clone() {
        try {
            WordpieceTokenizer copy = (WordpieceTokenizer)super.clone();
            copy.whitespaceTokenizer = this.whitespaceTokenizer.clone();
            copy.basicTokenizer = this.basicTokenizer.clone();
            copy.reset = false;
            copy.currentToken = null;
            // super.clone() is shallow: give the copy its own list instead of
            // clearing (and then sharing) the original's token list.
            copy.currentWordpieceTokens = new ArrayList<>();
            copy.currentWordpieceIndex = -1;
            return copy;
        }
        catch (CloneNotSupportedException e) {
            throw new AssertionError("WordpieceTokenizer is Cloneable, but clone call failed", e);
        }
    }
}

