/*
 * Decompiled with CFR 0.152.
 */
package org.exist.storage.analysis;

import org.exist.storage.analysis.TextToken;
import org.exist.storage.analysis.Tokenizer;

/**
 * Hand-written scanner that splits a {@link CharSequence} into {@link TextToken}s.
 *
 * <p>Usage: call {@link #setText(CharSequence)} and then {@link #nextToken()} (or
 * {@link #nextToken(boolean)}) repeatedly until it returns {@code null}.
 *
 * <p>Instances carry mutable scan state (the current position and one reused
 * punctuation token, see {@link #p()}), so a single instance must not be shared
 * between concurrent scans.
 */
public class SimpleTokenizer implements Tokenizer {
    /** Sentinel returned by {@link #LA(int)} when looking past the end of the text. */
    private static final char EOF_CHAR = '\uffff';

    // Numeric token types as used by this class. The values are what the code below
    // passes to / compares against TextToken.
    /** Type compared against to detect end of input (presumably the type of TextToken.EOF_TOKEN - confirm). */
    private static final int TYPE_EOF = -1;
    /** Type assigned by {@link #alpha(TextToken, boolean)}. */
    private static final int TYPE_ALPHA = 1;
    /** Type assigned by {@link #alphanum(TextToken, boolean)}. */
    private static final int TYPE_ALPHANUM = 2;
    /** Type assigned by {@link #number()}. */
    private static final int TYPE_NUMBER = 6;
    /** Type assigned by {@link #p()} (single punctuation character). */
    private static final int TYPE_PUNCT = 7;
    /** Type that terminates compound tokens (presumably the type of TextToken.WS_TOKEN - confirm). */
    private static final int TYPE_WS = 8;

    /** Number of characters consumed so far; the next character to read is at index {@code pos}. */
    private int pos = 0;
    /** Stemming flag; stored by the constructor and setter but not read anywhere in this class. */
    private boolean stem = false;
    private CharSequence text;
    private int len = 0;
    /** Reused by {@link #p()} for every punctuation token. */
    private final TextToken temp = new TextToken();

    public SimpleTokenizer() {
    }

    /**
     * @param stem initial value of the stemming flag
     */
    public SimpleTokenizer(boolean stem) {
        this.stem = stem;
    }

    /**
     * @param stem new value of the stemming flag
     */
    public void setStemming(boolean stem) {
        this.stem = stem;
    }

    /**
     * Looks ahead without consuming.
     *
     * @param i 1-based offset from the current position ({@code LA(1)} is the next
     *          unconsumed character)
     * @return the character at that offset, or {@code '\uffff'} past the end of the text
     */
    private final char LA(int i) {
        int current = this.pos + i;
        return current > this.len ? EOF_CHAR : this.text.charAt(current - 1);
    }

    /**
     * Scans a run of letters (plus marks, non-breaking characters and, optionally,
     * wildcards) starting at the current position.
     *
     * <p>A character from {@link #singleCharToken(char)} is consumed only when it is the
     * first character of the run and always ends the run. A backslash followed by a
     * wildcard character also ends the run. If the character following the run is a
     * digit, scanning continues in {@link #alphanum(TextToken, boolean)} and the token
     * type changes accordingly.
     *
     * @param token          token to extend, or {@code null} to start a new one at the
     *                       current position
     * @param allowWildcards whether {@code ? * [ ]} count as part of the run
     * @return the (possibly newly created) token
     */
    protected TextToken alpha(TextToken token, boolean allowWildcards) {
        if (token == null) {
            token = new TextToken(TYPE_ALPHA, this.text, this.pos);
        } else {
            token.setType(TYPE_ALPHA);
        }
        char ch = this.LA(1);
        int count = 0;
        while (ch != EOF_CHAR) {
            if (ch == '\\' && SimpleTokenizer.isWildcard(this.LA(2))) {
                // escaped wildcard: leave it for nextTerminalToken() to handle
                break;
            }
            // '>=' (not '>') so that U+2E80, the first character accepted by
            // singleCharToken(), is consumed too. With '>' it fell through to the
            // letter check, failed it, and produced a zero-length token without
            // advancing the position.
            if (ch >= '\u2e80' && SimpleTokenizer.singleCharToken(ch)) {
                if (count == 0) {
                    token.consumeNext();
                    this.consume();
                    ch = this.LA(1);
                }
                break;
            }
            boolean partOfRun = Character.isLetter(ch)
                    || this.isMark(ch)
                    || SimpleTokenizer.nonBreakingChar(ch)
                    || (allowWildcards && SimpleTokenizer.isWildcard(ch));
            if (!partOfRun) {
                break;
            }
            token.consumeNext();
            this.consume();
            ch = this.LA(1);
            ++count;
        }
        if (Character.isDigit(ch)) {
            return this.alphanum(token, allowWildcards);
        }
        return token;
    }

    /**
     * @return {@code true} for the wildcard characters {@code ? * [ ]}
     */
    private static final boolean isWildcard(char ch) {
        return ch == '?' || ch == '*' || ch == '[' || ch == ']';
    }

    /**
     * Scans a run of letters and digits (and, optionally, wildcards) starting at the
     * current position.
     *
     * @param token          token to extend, or {@code null} to start a new one at the
     *                       current position
     * @param allowWildcards whether {@code ? * [ ]} count as part of the run
     * @return the (possibly newly created) token
     */
    protected TextToken alphanum(TextToken token, boolean allowWildcards) {
        if (token == null) {
            token = new TextToken(TYPE_ALPHANUM, this.text, this.pos);
        } else {
            token.setType(TYPE_ALPHANUM);
        }
        while (this.LA(1) != EOF_CHAR) {
            char ch = this.LA(1);
            boolean accepted = Character.isLetterOrDigit(ch)
                    || (allowWildcards && SimpleTokenizer.isWildcard(ch));
            if (!accepted) {
                break;
            }
            token.consumeNext();
            this.consume();
        }
        return token;
    }

    /** Advances the scan position by one character. */
    protected void consume() {
        ++this.pos;
    }

    /**
     * Advances the position and reports end of input.
     *
     * @return {@code TextToken.EOF_TOKEN}
     */
    protected TextToken eof() {
        this.consume();
        return TextToken.EOF_TOKEN;
    }

    /**
     * @return the length of the text passed to {@link #setText(CharSequence)}, or 0 if
     *         none was set
     */
    public int getLength() {
        return this.len;
    }

    /**
     * @return the current text as a string
     */
    public String getText() {
        return this.text.toString();
    }

    /**
     * Scans exactly one basic token at the current position: a letter run, a
     * letter/digit run, a single punctuation character, an end-of-input marker, or,
     * for any other character, {@code TextToken.WS_TOKEN}.
     *
     * @param wildcards whether {@code ? * [ ]} may be part of word tokens
     * @return the scanned token
     */
    protected TextToken nextTerminalToken(boolean wildcards) {
        TextToken token = null;
        char ch = this.LA(1);
        if (ch == EOF_CHAR) {
            return this.eof();
        }
        boolean wildcardHere = wildcards && SimpleTokenizer.isWildcard(ch);
        if (Character.isLetter(ch)
                || this.isMark(ch)
                || SimpleTokenizer.nonBreakingChar(ch)
                || SimpleTokenizer.singleCharToken(ch)
                || wildcardHere) {
            token = this.alpha(null, wildcards);
        }
        if (token == null && (Character.isLetterOrDigit(ch) || wildcardHere)) {
            token = this.alphanum(null, wildcards);
        }
        if (token == null) {
            switch (ch) {
                case '\\':
                    if (SimpleTokenizer.isWildcard(this.LA(2))) {
                        // skip the backslash so that p() captures the escaped wildcard
                        this.consume();
                    }
                    // intentional fall-through: emit a punctuation token
                case '*':
                case ',':
                case '-':
                case '.':
                case ':':
                case '@':
                case '_':
                    token = this.p();
                    break;
                default:
                    // every other character is treated as a separator
                    token = this.whitespace();
            }
        }
        return token;
    }

    /**
     * Same as {@code nextToken(false)}.
     *
     * @return the next word token, or {@code null} at end of input
     */
    public TextToken nextToken() {
        return this.nextToken(false);
    }

    /**
     * Returns the next word token, skipping separators and stand-alone punctuation.
     *
     * <p>Letter tokens may be joined with a following apostrophe + letter token, and
     * letter or letter/digit tokens may be joined across certain punctuation characters
     * into one compound token (see the private helpers below).
     *
     * @param wildcards whether {@code ? * [ ]} may be part of word tokens
     * @return the next token, or {@code null} at end of input or if scanning failed with
     *         an exception (which is printed, not propagated)
     */
    public TextToken nextToken(boolean wildcards) {
        try {
            // Loop instead of recursing once per skipped token: a long run of
            // separators previously grew the call stack without bound.
            while (true) {
                TextToken token = this.nextTerminalToken(wildcards);
                int oldPos = this.pos;
                char la1 = this.LA(1);
                switch (token.getType()) {
                    case TYPE_EOF:
                        return null;
                    case TYPE_ALPHA:
                        return this.extendAlpha(token, la1, oldPos, wildcards);
                    case TYPE_ALPHANUM:
                        return this.extendAlphanum(token, la1, wildcards);
                    default:
                        // separator or punctuation: skip it and scan again
                        break;
                }
            }
        }
        catch (Exception e) {
            // Best effort, kept from the original behavior: report and signal "no more tokens".
            System.out.println("text: " + this.text);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Tries to extend a letter token that is followed by {@code la1}.
     *
     * <ul>
     * <li>apostrophe + letter token: both are joined into one letter token;</li>
     * <li>{@code . / : _} directly before end of input or whitespace: that character is
     *     consumed and the token returned unchanged;</li>
     * <li>{@code . / : _} otherwise: following basic tokens are appended, and the result
     *     is returned as a letter/digit token only if at least one appended token was a
     *     letter/digit token; if not, the position is rewound to {@code oldPos}.</li>
     * </ul>
     */
    private TextToken extendAlpha(TextToken token, char la1, int oldPos, boolean wildcards) {
        if (la1 == '\'') {
            this.consume();
            StringBuilder joined = new StringBuilder(token.getText());
            TextToken next = this.nextTerminalToken(wildcards);
            if (next != null && next.getType() == TYPE_ALPHA) {
                joined.append('\'');
                joined.append(next.getText());
                return new TextToken(TYPE_ALPHA, joined.toString());
            }
            // not followed by letters: rewind to just after the original token
            this.pos = oldPos;
            return token;
        }
        switch (la1) {
            case '.':
            case '/':
            case ':':
            case '_':
                break;
            default:
                return token;
        }
        if (this.LA(2) == EOF_CHAR || Character.isWhitespace(this.LA(2))) {
            // trailing connector: drop it
            this.consume();
            return token;
        }
        boolean sawAlphanum = false;
        StringBuilder compound = new StringBuilder(token.getText());
        TextToken next;
        while ((next = this.nextTerminalToken(wildcards)) != null) {
            int type = next.getType();
            if (type == TYPE_EOF || type == TYPE_WS) {
                break;
            }
            if (type == TYPE_PUNCT
                    && (this.LA(1) == EOF_CHAR || Character.isWhitespace(this.LA(1)))) {
                // punctuation at the very end of the compound is not part of it
                break;
            }
            if (type == TYPE_ALPHANUM) {
                sawAlphanum = true;
            }
            compound.append(next.getText());
        }
        if (sawAlphanum) {
            return new TextToken(TYPE_ALPHANUM, compound.toString());
        }
        this.pos = oldPos;
        return token;
    }

    /**
     * Tries to extend a letter/digit token that is followed by {@code la1}.
     *
     * <p>If {@code la1} is one of {@code * , - . / : @ _} and is directly followed by end
     * of input or whitespace, it is consumed and the token returned unchanged. If it is
     * followed by anything else, all basic tokens up to the next separator or end of
     * input are appended and returned as one letter/digit token.
     */
    private TextToken extendAlphanum(TextToken token, char la1, boolean wildcards) {
        switch (la1) {
            case '*':
            case ',':
            case '-':
            case '.':
            case '/':
            case ':':
            case '@':
            case '_':
                break;
            default:
                return token;
        }
        if (this.LA(2) == EOF_CHAR || Character.isWhitespace(this.LA(2))) {
            this.consume();
            return token;
        }
        StringBuilder compound = new StringBuilder(token.getText());
        TextToken next;
        while ((next = this.nextTerminalToken(wildcards)) != null
                && next.getType() != TYPE_EOF
                && next.getType() != TYPE_WS) {
            compound.append(next.getText());
        }
        return new TextToken(TYPE_ALPHANUM, compound.toString());
    }

    /**
     * Scans a run of digits at the current position. Not called from within this class.
     *
     * @return the number token, or {@code null} (with the position restored) if the
     *         digits are directly followed by a letter
     */
    protected TextToken number() {
        TextToken token = new TextToken(TYPE_NUMBER, this.text, this.pos);
        int oldPos = this.pos;
        while (this.LA(1) != EOF_CHAR && Character.isDigit(this.LA(1))) {
            token.consumeNext();
            this.consume();
        }
        if (Character.isLetter(this.LA(1))) {
            this.pos = oldPos;
            return null;
        }
        return token;
    }

    /**
     * Consumes one character as a punctuation token.
     *
     * @return the shared, reused token instance; its contents are overwritten by the
     *         next call, so read it before scanning further
     */
    protected TextToken p() {
        this.temp.set(TYPE_PUNCT, this.text, this.pos);
        this.temp.consumeNext();
        this.consume();
        return this.temp;
    }

    /**
     * Sets the text to scan and resets the scan position to the start.
     *
     * @param text the text to tokenize; must not be {@code null}
     */
    public void setText(CharSequence text) {
        this.pos = 0;
        this.len = text.length();
        this.text = text;
    }

    /**
     * Consumes one character as a separator.
     *
     * @return {@code TextToken.WS_TOKEN}
     */
    protected TextToken whitespace() {
        this.consume();
        return TextToken.WS_TOKEN;
    }

    /**
     * @return {@code true} if {@code ch} lies in one of the ranges that are emitted as
     *         one-character tokens (CJK radicals, ideographic description characters,
     *         enclosed CJK and CJK compatibility, CJK ideographs, Yijing hexagrams, CJK
     *         compatibility ideographs and forms)
     */
    private static final boolean singleCharToken(char ch) {
        return ch >= '\u2e80' && ch <= '\u2eff'
                || ch >= '\u2f00' && ch <= '\u2fdf'
                || ch >= '\u2ff0' && ch <= '\u2fff'
                || ch >= '\u3200' && ch <= '\u32ff'
                || ch >= '\u3300' && ch <= '\u33ff'
                || ch >= '\u3400' && ch <= '\u4db5'
                || ch >= '\u4dc0' && ch <= '\u4dff'
                || ch >= '\u4e00' && ch <= '\u9fff'
                || ch >= '\uf900' && ch <= '\ufaff'
                || ch >= '\ufe30' && ch <= '\ufe4f';
    }

    /**
     * @return {@code true} if {@code ch} lies in one of the ranges that are kept inside a
     *         letter run (Hiragana, Katakana, Bopomofo, Hangul compatibility Jamo,
     *         Kanbun, Bopomofo extended, Katakana phonetic extensions, Hangul syllables)
     */
    private static final boolean nonBreakingChar(char ch) {
        return ch >= '\u3040' && ch <= '\u309f'
                || ch >= '\u30a0' && ch <= '\u30ff'
                || ch >= '\u3100' && ch <= '\u312f'
                || ch >= '\u3130' && ch <= '\u318f'
                || ch >= '\u3190' && ch <= '\u319f'
                || ch >= '\u31a0' && ch <= '\u31bf'
                || ch >= '\u31f0' && ch <= '\u31ff'
                || ch >= '\uac00' && ch <= '\ud7a3';
    }

    /**
     * @return {@code true} for U+093E..U+094B (exclusive bounds U+093D and U+094C), a
     *         range of Devanagari combining signs that are kept inside a letter run
     */
    private final boolean isMark(char ch) {
        return ch > '\u093d' && ch < '\u094c';
    }

    /**
     * Small manual driver: tokenizes a sample string with wildcards enabled and
     * discards the tokens.
     */
    public static void main(String[] args) {
        // t1 and t3 are alternative samples; pass them to setText() to try them.
        String t1 = "\u30a8\u30fb\u31a1\uacff\u2faa\u312a\u3045";
        String t2 = "\u9078\u5b98\u76cd\u82e5\u7a76\u51fa\u4e16\u6cd5\u4ee5\u9078\u4f5b\u90aa\u5e2b\u5584\u5176\u8a00\u6bc5\u6b32\u8d85";
        String t3 = "\ubb38\uc790 \uc0ac\uc6a9 \uc0c1\uc758 \uc624\ub958\ub97c \ucc3e\uc544\ub0b4\uae30 \uc704\ud574 \uac80\uc99d\ub41c \uc911\uad6d\uc5b4 \ud310\uc744 \uc7ac\uac80\ud1a0\ud558\uace0, \ubcf4\ub2e4 \uc77d\uae30 \uc27d\uac8c \ud558\uae30 \uc704\ud574 \uc5b8\uc5b4\uc801 \ud45c\ud604\uc744 \ub2e4\ub4ec\ub294\ub2e4.";
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.setText(t2);
        TextToken token = tokenizer.nextToken(true);
        while (token != null && token.getType() != TYPE_EOF) {
            token = tokenizer.nextToken(true);
        }
    }
}

