package org.unicode.cldr.util;

import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/* loaded from: input_file:org/unicode/cldr/util/Segmenter.class */
public class Segmenter {
    // Not referenced anywhere in this file.
    private static final boolean JDK4HACK = true;
    // Same value as the literal 44 handed to the Pattern.compile calls in this file.
    // 44 == Pattern.COMMENTS (4) | Pattern.MULTILINE (8) | Pattern.DOTALL (32).
    public static final int REGEX_FLAGS = 44;
    // Not referenced anywhere in this file.
    private static final boolean SHOW_VAR_CONTENTS = false;
    // Pseudo rule number: same value breaksAt records when the offset falls inside a surrogate pair.
    public static final double NOBREAK_SUPPLEMENTARY = 0.1d;
    // Pseudo rule number: same value breaksAt records for offset 0 (start of text).
    public static final double BREAK_SOT = 0.2d;
    // Pseudo rule number: same value breaksAt records for offset == length (end of text).
    public static final double BREAK_EOT = 0.3d;
    // Pseudo rule number: same value breaksAt records when no rule matched.
    public static final double BREAK_ANY = 999.0d;
    // Assigned in the static initializer: longer strings sort first, equal lengths by String.compareTo.
    static Comparator LONGEST_STRING_FIRST;
    // Rule number recorded by the most recent breaksAt call; exposed through getBreakRule().
    private double breakRule;
    // Assigned in the static initializer. Each row: [0] is the rule-set name matched by
    // make(factory, name); the remaining entries are fed line by line to Builder.addLine.
    static String[][] cannedRules;
    // When non-null, Builder.getInsertablePattern restricts every set to these code points.
    public static final UnicodeSet DEBUG_RETAIN = null;
    // Formats rule numbers for the XML "id" attribute; the static initializer sets
    // its minimum fraction digits to 0.
    public static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH);
    // Replaced by Builder.make() with the builder's sample map; returned by getSamples().
    private UnicodeMap samples = new UnicodeMap();
    // Parallel lists filled by add(): rules.get(i) carries the rule number orders.get(i).
    private List rules = new ArrayList(1);
    private List orders = new ArrayList(1);

    /* loaded from: input_file:org/unicode/cldr/util/Segmenter$Builder.class */
    public static class Builder {
        // Property source set by the constructor; consulted by MyXSymbolTable.applyPropertyAlias.
        private UnicodeProperty.Factory propFactory;
        // When true, addVariable prints to System.out which sample set (if any) it recorded.
        static boolean SHOW_SAMPLES = false;
        // Composer passed to composeWith when addVariable records samples.
        static MyComposer myComposer = new MyComposer();
        // Code points that JavaRegexShower emits as \\uXXXX escapes.
        static UnicodeSet JavaRegex_uxxx = new UnicodeSet("[[:White_Space:][:defaultignorablecodepoint:]#]");
        // Code points that JavaRegexShower emits with a leading backslash.
        static UnicodeSet JavaRegex_slash = new UnicodeSet("[[:Pattern_White_Space:]\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]");
        /**
         * Renders one code point for use inside a regex character class:
         * members of {@code JavaRegex_uxxx} become backslash-u escapes (a surrogate
         * pair above U+FFFF), members of {@code JavaRegex_slash} get a leading
         * backslash, and everything else is emitted literally.
         */
        static CodePointShower JavaRegexShower = new CodePointShower() {
            @Override
            public String show(int i) {
                if (Builder.JavaRegex_uxxx.contains(i)) {
                    if (i > 0xFFFF) {
                        // Supplementary code point: escape each surrogate separately.
                        String lead = com.ibm.icu.impl.Utility.hex(UTF16.getLeadSurrogate(i));
                        String trail = com.ibm.icu.impl.Utility.hex(UTF16.getTrailSurrogate(i));
                        return "\\u" + lead + "\\u" + trail;
                    }
                    return "\\u" + com.ibm.icu.impl.Utility.hex(i);
                }
                if (Builder.JavaRegex_slash.contains(i)) {
                    return "\\" + UTF16.valueOf(i);
                }
                return UTF16.valueOf(i);
            }
        };
        // XML <variable> lines (plus any comments flushed ahead of them), in insertion order;
        // written out by toString(String, String).
        private List rawVariables = new ArrayList();
        // Rule number (Double) -> XML text of the rule, or of a comment keyed just below it.
        private Map xmlRules = new TreeMap();
        // Rule number (Double) -> HTML text; the constructor seeds three entries.
        private Map htmlRules = new TreeMap();
        // Pending "<!-- ... -->" comments not yet attached to a variable or rule.
        private List lastComments = new ArrayList();
        // Filled by addVariable via composeWith; handed to the Segmenter in make().
        private UnicodeMap samples = new UnicodeMap();
        // Matches whitespace runs; addRule collapses each run to a single space.
        private transient Matcher whiteSpace = Pattern.compile("\\s+", 44).matcher("");
        // Matches "$" + letter + alphanumerics; used to validate variable names.
        private transient Matcher identifierMatcher = Pattern.compile("[$]\\p{Alpha}\\p{Alnum}*", 44).matcher("");
        // addRule rejects any rule text in which this pattern is found.
        private transient Matcher brokenIdentifierMatcher = Pattern.compile("[^$\\p{Alpha}]\\p{Alnum}", 44).matcher("");
        // Variable name -> resolved value; iterated longest name first by replaceVariables.
        private Map variables = new TreeMap(Segmenter.LONGEST_STRING_FIRST);
        // Rule number (Double) -> Rule; copied into the Segmenter by make().
        private Map rules = new TreeMap();
        // Reused cursor for the UnicodeSet parsing done in addVariable and replaceVariables.
        transient ParsePosition parsePosition = new ParsePosition(0);
        // Symbol table passed to UnicodeSet so property lookups go through propFactory.
        private UnicodeSet.XSymbolTable symbolTable = new MyXSymbolTable(this, null);

        /* JADX INFO: Access modifiers changed from: package-private */
        /* loaded from: input_file:org/unicode/cldr/util/Segmenter$Builder$MyComposer.class */
        public static class MyComposer implements UnicodeMap.Composer {
            MyComposer() {
            }

            /**
             * Merges an existing value with a new one.
             *
             * @param i    code point being composed (not used here)
             * @param obj  value already present, may be null
             * @param obj2 value being added, may be null
             * @return {@code obj2} when nothing was present; the existing value when the
             *         new one is null or equal to it; otherwise {@code obj + "_" + obj2}
             */
            public Object compose(int i, Object obj, Object obj2) {
                if (obj == null) {
                    return obj2;
                }
                boolean nothingNew = obj2 == null || obj.equals(obj2);
                return nothingNew ? obj : obj + "_" + obj2;
            }
        }

        /* loaded from: input_file:org/unicode/cldr/util/Segmenter$Builder$MyXSymbolTable.class */
        /**
         * Symbol table that resolves property aliases through the owning builder's
         * {@code propFactory} instead of the default property lookup.
         */
        private class MyXSymbolTable extends UnicodeSet.XSymbolTable {
            /** Builder whose property factory is consulted. */
            private final Builder owner;

            private MyXSymbolTable(Builder builder) {
                this.owner = builder;
            }

            /**
             * Two-argument form kept for the existing {@code new MyXSymbolTable(this, null)}
             * call site. The second parameter was a decompiler-generated marker of an
             * undefined type; it is typed as Object so the class compiles, and is ignored.
             */
            MyXSymbolTable(Builder builder, Object unusedMarker) {
                this(builder);
            }

            /**
             * Fills {@code unicodeSet} with the code points for which property {@code str}
             * has value {@code str2}.
             *
             * @return false when the property is unknown or the resulting set is empty
             */
            public boolean applyPropertyAlias(String str, String str2, UnicodeSet unicodeSet) {
                UnicodeProperty property = this.owner.propFactory.getProperty(str);
                if (property == null) {
                    return false;
                }
                unicodeSet.clear();
                return property.getSet(str2, unicodeSet).size() != 0;
            }
        }

        public Builder(UnicodeProperty.Factory factory) {
            this.propFactory = factory;
            this.htmlRules.put(new Double(0.2d), "sot ÷");
            this.htmlRules.put(new Double(0.3d), "÷ eot");
            this.htmlRules.put(new Double(999.0d), "÷ Any");
        }

        /**
         * Renders the collected variables, rules and trailing comments as a
         * {@code <segmentation>} XML fragment with CRLF line endings.
         *
         * @param str  value of the {@code type} attribute
         * @param str2 indentation prefix written at the start of every line
         * @return the XML fragment
         */
        public String toString(String str, String str2) {
            final String eol = "\r\n";
            StringBuilder xml = new StringBuilder();
            xml.append(str2).append("<segmentation type=\"").append(str).append("\">").append(eol);

            xml.append(str2).append("\t<variables>").append(eol);
            for (Iterator it = this.rawVariables.iterator(); it.hasNext();) {
                xml.append(str2).append("\t\t").append(it.next()).append(eol);
            }
            xml.append(str2).append("\t</variables>").append(eol);

            xml.append(str2).append("\t<segmentRules>").append(eol);
            // TreeMap values come back in ascending rule-number order.
            for (Iterator it = this.xmlRules.values().iterator(); it.hasNext();) {
                xml.append(str2).append("\t\t").append(it.next()).append(eol);
            }
            xml.append(str2).append("\t</segmentRules>").append(eol);

            for (Iterator it = this.lastComments.iterator(); it.hasNext();) {
                xml.append(str2).append("\t").append(it.next()).append(eol);
            }
            xml.append(str2).append("</segmentation>").append(eol);
            return xml.toString();
        }

        /**
         * Processes one line of rule source.
         * <ul>
         * <li>{@code show ...} prints the expression and its expansion to System.out.</li>
         * <li>{@code # ...} is stored as a pending XML comment.</li>
         * <li>Any line containing {@code =} is treated as {@code name = value}.</li>
         * <li>Anything else must look like {@code <number>) before ÷ after} or use
         *     {@code ×} in place of {@code ÷}.</li>
         * </ul>
         *
         * @param str the line to process
         * @return true only when a rule was added
         * @throws IllegalArgumentException if the rule line cannot be parsed or added; the
         *         underlying failure is attached as the cause
         */
        public boolean addLine(String str) {
            if (str.startsWith("show")) {
                String expression = str.substring(4).trim();
                System.out.println("# " + expression + ": ");
                System.out.println("\t" + replaceVariables(expression));
                return false;
            }
            if (str.startsWith("#")) {
                this.lastComments.add("<!-- " + str.substring(1).trim() + " -->");
                return false;
            }
            int equalsAt = str.indexOf('=');
            if (equalsAt >= 0) {
                addVariable(str.substring(0, equalsAt).trim(), str.substring(equalsAt + 1).trim());
                return false;
            }
            try {
                int parenAt = str.indexOf(')');
                Double order = Double.valueOf(Double.parseDouble(str.substring(0, parenAt).trim()));
                String body = str.substring(parenAt + 1).trim();

                byte breaks = Rule.BREAK;
                int operatorAt = body.indexOf('\u00F7'); // division sign: break
                if (operatorAt < 0) {
                    operatorAt = body.indexOf('\u00D7'); // multiplication sign: no break
                    if (operatorAt < 0) {
                        throw new IllegalArgumentException("Couldn't find =, ÷, or ×");
                    }
                    breaks = Rule.NO_BREAK;
                }
                String before = body.substring(0, operatorAt).trim();
                String after = body.substring(operatorAt + 1).trim();
                addRule(order, before, breaks, after, body);
                return true;
            } catch (Exception e) {
                // Keep the historical message but chain the cause so the real problem
                // (missing operator, illegal identifier, regex error) stays visible.
                throw new IllegalArgumentException("Rule must be of form '1)...': " + str, e);
            }
        }

        /**
         * Defines variable {@code str} with value {@code str2}.
         *
         * Pending comments are moved into the raw variable list, the XML form of the
         * definition is recorded, the name is validated, and the value is expanded with
         * {@code replaceVariables}. If the expanded value parses completely as a
         * non-empty UnicodeSet, its code points are recorded as samples under the
         * variable name (without the leading {@code $}). Finally the expanded value
         * must compile as a regex before it is stored.
         *
         * @param str  variable name, must match {@code $} + letter + alphanumerics
         * @param str2 unexpanded value
         * @return this builder
         * @throws IllegalArgumentException if the name is malformed or the expanded
         *         value is not a valid regex (PatternSyntaxException is a subclass)
         */
        Builder addVariable(String str, String str2) {
            if (!this.lastComments.isEmpty()) {
                this.rawVariables.addAll(this.lastComments);
                this.lastComments.clear();
            }
            this.rawVariables.add("<variable id=\"" + str + "\">"
                    + TransliteratorUtilities.toXML.transliterate(str2) + "</variable>");
            if (!this.identifierMatcher.reset(str).matches()) {
                throw new IllegalArgumentException("Variable name must be $id: '" + str + "'");
            }
            String resolved = replaceVariables(str2);
            try {
                this.parsePosition.setIndex(0);
                UnicodeSet sampleSet = new UnicodeSet(resolved, this.parsePosition, this.symbolTable);
                if (this.parsePosition.getIndex() != resolved.length()) {
                    // The value is more than a single set, so there is nothing to sample.
                    if (SHOW_SAMPLES) {
                        System.out.println(this.parsePosition.getIndex() + ", " + resolved.length()
                                + " -- No samples for: " + str + " = " + resolved);
                    }
                } else if (sampleSet.size() != 0) {
                    String label = str.startsWith("$") ? str.substring(1) : str;
                    composeWith(this.samples, sampleSet, label, myComposer);
                    if (SHOW_SAMPLES) {
                        System.out.println("Samples for: " + str + " = " + resolved);
                        System.out.println("\t" + sampleSet);
                    }
                } else if (SHOW_SAMPLES) {
                    System.out.println("Empty -- No samples for: " + str + " = " + resolved);
                }
            } catch (Exception e) {
                // Sampling is best effort: a value that is not a UnicodeSet is still a
                // legal variable. Report the reason only in diagnostic mode.
                if (SHOW_SAMPLES) {
                    System.out.println("Failed -- No samples for: " + str + " = " + resolved + " (" + e + ")");
                }
            }
            // Validation only: throws if the expanded value is not a legal regex.
            Pattern.compile(resolved, 44);
            this.variables.put(str, resolved);
            return this;
        }

        /**
         * For every code point in {@code unicodeSet}, combines the map's current value
         * with {@code obj} using {@code composer} and stores the result when it differs
         * from what was there.
         *
         * @param unicodeMap map to update in place
         * @param unicodeSet code points to visit
         * @param obj        value passed to the composer as the new value
         * @param composer   combines old and new values
         * @return {@code unicodeMap}
         */
        public static UnicodeMap composeWith(UnicodeMap unicodeMap, UnicodeSet unicodeSet, Object obj, UnicodeMap.Composer composer) {
            for (UnicodeSetIterator points = new UnicodeSetIterator(unicodeSet); points.next();) {
                int codePoint = points.codepoint;
                Object existing = unicodeMap.getValue(codePoint);
                Object merged = composer.compose(codePoint, existing, obj);
                boolean unchanged = existing == merged || (existing != null && existing.equals(merged));
                if (!unchanged) {
                    unicodeMap.put(codePoint, merged);
                }
            }
            return unicodeMap;
        }

        /**
         * Diagnostic helper (no caller in this file): parses {@code str} as a UnicodeSet
         * and compiles the regex form of ever larger prefixes of it, so the first
         * code point that makes the regex invalid is pinpointed.
         *
         * @param str UnicodeSet pattern to probe
         * @throws IllegalArgumentException wrapping the PatternSyntaxException, with the
         *         failing position marked by {@code <<<>>>} when the position is known
         */
        private void findRegexProblem(String str) {
            UnicodeSet whole = new UnicodeSet(str);
            String candidate = null;
            for (int i = 0; i < whole.size(); i++) {
                try {
                    // Keep only the code points up to and including the i-th element.
                    candidate = getInsertablePattern(new UnicodeSet(whole).retain(0, whole.charAt(i)));
                    Pattern.compile(candidate, 44);
                } catch (PatternSyntaxException e) {
                    // getIndex() is -1 when the position is unknown; do not let substring
                    // throw and mask the real error.
                    int index = e.getIndex();
                    String shown;
                    if (candidate != null && index >= 0 && index <= candidate.length()) {
                        shown = candidate.substring(0, index) + "<<<>>>" + candidate.substring(index);
                    } else {
                        shown = String.valueOf(candidate);
                    }
                    throw new IllegalArgumentException("Can't parse: " + shown, e);
                }
            }
        }

        /**
         * Registers one rule under rule number {@code d}.
         *
         * Pending comments are stored in the XML map under keys just below {@code d}
         * (spaced 1.0E-4 apart) so they sort immediately before the rule. The rule text
         * is whitespace-normalized for display, and both sides are expanded with
         * {@code replaceVariables} before being compiled into a {@link Rule}.
         *
         * @param d    rule number used as the sort key
         * @param str  unexpanded pattern for the text before the position
         * @param b    break action stored in the rule
         * @param str2 unexpanded pattern for the text after the position
         * @param str3 full rule text, used for display and identifier checking
         * @return this builder
         * @throws IllegalArgumentException if {@code str3} contains a malformed identifier
         */
        Builder addRule(Double d, String str, byte b, String str2, String str3) {
            if (this.brokenIdentifierMatcher.reset(str3).find()) {
                int at = this.brokenIdentifierMatcher.start();
                throw new IllegalArgumentException(
                        "Illegal identifier at:" + str3.substring(0, at) + "<<>>" + str3.substring(at));
            }
            String displayText = this.whiteSpace.reset(str3).replaceAll(" ");

            int pending = this.lastComments.size();
            if (pending != 0) {
                double commentKey = d.doubleValue() - (1.0E-4d * pending);
                for (int n = 0; n < pending; n++) {
                    Double key = Double.valueOf(commentKey);
                    if (this.xmlRules.containsKey(key)) {
                        System.out.println("WARNING: Overriding rule " + key);
                    }
                    this.xmlRules.put(key, this.lastComments.get(n));
                    // Accumulate rather than multiply so keys match earlier runs exactly.
                    commentKey += 1.0E-4d;
                }
                this.lastComments.clear();
            }

            this.htmlRules.put(d, TransliteratorUtilities.toHTML.transliterate(displayText));
            String xml = "<rule id=\"" + Segmenter.nf.format(d) + "\"" + "> "
                    + TransliteratorUtilities.toXML.transliterate(displayText) + " </rule>";
            this.xmlRules.put(d, xml);
            this.rules.put(d, new Rule(replaceVariables(str), b, replaceVariables(str2), displayText));
            return this;
        }

        /**
         * Builds a Segmenter from the rules added so far, in ascending rule-number
         * order. The returned segmenter shares this builder's sample map.
         *
         * @return a new Segmenter
         */
        public Segmenter make() {
            Segmenter segmenter = new Segmenter();
            // The rule map is a raw Map, so entries must be cast explicitly; a typed
            // for-each over its key set does not compile.
            for (Iterator it = this.rules.entrySet().iterator(); it.hasNext();) {
                Map.Entry entry = (Map.Entry) it.next();
                double order = ((Double) entry.getKey()).doubleValue();
                segmenter.add(order, (Rule) entry.getValue());
            }
            segmenter.samples = this.samples;
            return segmenter;
        }

        /**
         * Expands a rule or variable value in two passes.
         *
         * Pass 1 replaces every known {@code $name} with its stored value; names are
         * tried in the variable map's order (longest first) and the inserted text is
         * not rescanned. Pass 2 replaces every embedded UnicodeSet pattern with the
         * regex character class produced by {@code getInsertablePattern}.
         *
         * @param str text to expand
         * @return the expanded text
         * @throws IllegalArgumentException if a {@code $identifier} is not a known variable
         */
        private String replaceVariables(String str) {
            String result = str;

            // Pass 1: variable substitution.
            int position = 0;
            while ((position = result.indexOf('$', position)) >= 0) {
                String matchedName = null;
                for (Iterator it = this.variables.keySet().iterator(); it.hasNext();) {
                    String candidate = (String) it.next();
                    if (result.regionMatches(position, candidate, 0, candidate.length())) {
                        matchedName = candidate;
                        break;
                    }
                }
                if (matchedName != null) {
                    String value = (String) this.variables.get(matchedName);
                    result = result.substring(0, position) + value
                            + result.substring(position + matchedName.length());
                    // Skip the inserted text so substitutions cannot overlap.
                    position += value.length();
                } else if (this.identifierMatcher.reset(result.substring(position)).lookingAt()) {
                    throw new IllegalArgumentException("Illegal variable at: '" + result.substring(position) + "'");
                } else {
                    // A '$' that starts neither a known variable nor an identifier is
                    // left as is; step past it instead of looping on it forever.
                    position++;
                }
            }

            // Pass 2: rewrite UnicodeSet patterns as regex character classes.
            for (int i = 0; i < result.length(); i++) {
                if (UnicodeSet.resemblesPattern(result, i)) {
                    this.parsePosition.setIndex(i);
                    UnicodeSet parsed = new UnicodeSet(result, this.parsePosition, this.symbolTable);
                    String regexClass = getInsertablePattern(parsed);
                    result = result.substring(0, i) + regexClass
                            + result.substring(this.parsePosition.getIndex());
                    // Land on the last inserted character; the loop increment moves past it.
                    i += regexClass.length() - 1;
                }
            }
            return result;
        }

        /**
         * Converts a set into a regex character class string.
         *
         * The argument is modified in place: it is complemented twice (presumably to
         * normalize it; confirm against the ICU UnicodeSet documentation), code points
         * U+10000..U+10FFFF are removed, and, when {@code Segmenter.DEBUG_RETAIN} is
         * set, it is intersected with that set (falling back to U+FFFF if empty).
         *
         * @param unicodeSet set to render; mutated
         * @return the character class text
         * @throws IllegalArgumentException if the text does not parse back to the same set
         */
        private String getInsertablePattern(UnicodeSet unicodeSet) {
            unicodeSet.complement().complement();
            unicodeSet.remove(0x10000, 0x10FFFF);

            UnicodeSet debugFilter = Segmenter.DEBUG_RETAIN;
            if (debugFilter != null) {
                unicodeSet.retainAll(debugFilter);
                if (unicodeSet.size() == 0) {
                    unicodeSet.add(0xFFFF);
                }
            }

            String rendered = toPattern(unicodeSet, JavaRegexShower);
            boolean roundTrips = new UnicodeSet(rendered).equals(unicodeSet);
            if (!roundTrips) {
                throw new IllegalArgumentException("Failure on UnicodeSet print");
            }
            return rendered;
        }

        /**
         * Writes the ranges of {@code unicodeSet} as a bracketed character class.
         * A single code point is written once, a two-element range as its two
         * members, and a longer range as {@code first-last}.
         *
         * @param unicodeSet      set whose ranges are written
         * @param codePointShower renders each individual code point
         * @return the character class text, including the surrounding brackets
         */
        private static String toPattern(UnicodeSet unicodeSet, CodePointShower codePointShower) {
            StringBuilder out = new StringBuilder();
            out.append('[');
            for (UnicodeSetIterator ranges = new UnicodeSetIterator(unicodeSet); ranges.nextRange();) {
                int first = ranges.codepoint;
                out.append(codePointShower.show(first));
                int last = ranges.codepointEnd;
                if (last > first) {
                    if (last - first > 1) {
                        out.append('-');
                    }
                    out.append(codePointShower.show(last));
                }
            }
            return out.append(']').toString();
        }

        /**
         * @return a read-only view of the variable map (name to expanded value)
         */
        public Map getVariables() {
            Map readOnlyView = Collections.unmodifiableMap(this.variables);
            return readOnlyView;
        }

        /**
         * @return a new list with one {@code "<number>)\t<html>"} string per entry of
         *         the HTML rule map, in ascending rule-number order
         */
        public List getRules() {
            List lines = new ArrayList();
            for (Iterator it = this.htmlRules.entrySet().iterator(); it.hasNext();) {
                Map.Entry entry = (Map.Entry) it.next();
                lines.add(entry.getKey() + ")\t" + entry.getValue());
            }
            return lines;
        }
    }

    /* loaded from: input_file:org/unicode/cldr/util/Segmenter$CodePointShower.class */
    /** Strategy for rendering a single code point as text. */
    public interface CodePointShower {
        /**
         * @param i code point to render
         * @return the text to emit for that code point
         */
        String show(int i);
    }

    /* loaded from: input_file:org/unicode/cldr/util/Segmenter$Rule.class */
    /**
     * One segmentation rule: a regex for the text before a position, a regex for the
     * text after it, and the action to report when both match.
     */
    public static class Rule {
        public static final byte NO_BREAK = -1;
        public static final byte UNKNOWN_BREAK = 0;
        public static final byte BREAK = 1;
        private Matcher matchPrevious;
        private Matcher matchSucceeding;
        private String name;
        private String resolved;
        private byte breaks;

        /**
         * @param str  regex for the text before the position; wrapped as {@code .*(str)}
         * @param b    value returned by {@link #matches} when the rule applies
         * @param str2 regex for the text after the position
         * @param str3 display name of the rule
         * @throws IllegalArgumentException if either regex cannot be compiled; the
         *         message names the rule and the offending pattern
         */
        public Rule(String str, byte b, String str2, String str3) {
            this.breaks = b;
            String before = ".*(" + str + ")";
            // Tracks the pattern being compiled so a failure in either compile is
            // reported against the right text (previously null for the first one).
            String parsing = before;
            try {
                this.matchPrevious = Pattern.compile(before, 44).matcher("");
                parsing = str2;
                this.matchSucceeding = Pattern.compile(str2, 44).matcher("");
                this.name = str3;
                this.resolved = com.ibm.icu.impl.Utility.escape(before)
                        + (b == NO_BREAK ? " × " : " ÷ ")
                        + com.ibm.icu.impl.Utility.escape(str2);
            } catch (PatternSyntaxException e) {
                throw new IllegalArgumentException(
                        "On <" + str3 + ">, Can't parse: " + markErrorPosition(parsing, e.getIndex()), e);
            } catch (RuntimeException e2) {
                throw new IllegalArgumentException("On <" + str3 + ">, Can't parse: " + parsing, e2);
            }
        }

        /**
         * Inserts {@code <<<>>>} at {@code index}, or returns the pattern unchanged
         * when the index is unknown (-1) or outside the pattern.
         */
        private static String markErrorPosition(String pattern, int index) {
            if (pattern == null || index < 0 || index > pattern.length()) {
                return String.valueOf(pattern);
            }
            return pattern.substring(0, index) + "<<<>>>" + pattern.substring(index);
        }

        /**
         * Tests this rule at offset {@code i} of {@code charSequence}.
         *
         * @return the rule's break action when the text from {@code i} starts with the
         *         "after" pattern and the text before {@code i} fully matches the
         *         "before" pattern; otherwise {@link #UNKNOWN_BREAK}
         */
        public byte matches(CharSequence charSequence, int i) {
            // The "after" side is checked first, as in the original evaluation order.
            if (!Segmenter.matchAfter(this.matchSucceeding, charSequence, i)) {
                return UNKNOWN_BREAK;
            }
            if (!Segmenter.matchBefore(this.matchPrevious, charSequence, i)) {
                return UNKNOWN_BREAK;
            }
            return this.breaks;
        }

        /** @return the rule's display name */
        public String toString() {
            return toString(false);
        }

        /**
         * @param z when true, append {@code ": "} and the escaped, fully resolved rule
         * @return the display name, optionally followed by the resolved form
         */
        public String toString(boolean z) {
            if (!z) {
                return this.name;
            }
            return this.name + ": " + this.resolved;
        }
    }

    /**
     * Creates a Builder loaded with one of the built-in rule sets.
     *
     * @param factory property source for the builder
     * @param str     rule-set name, compared with the first entry of each canned row
     * @return the populated builder, or null when no rule set has that name
     */
    public static Builder make(UnicodeProperty.Factory factory, String str) {
        Builder builder = new Builder(factory);
        for (String[] ruleSet : cannedRules) {
            if (!ruleSet[0].equals(str)) {
                continue;
            }
            // Entry 0 is the name; the remaining entries are source lines.
            for (int line = 1; line < ruleSet.length; line++) {
                builder.addLine(ruleSet[line]);
            }
            return builder;
        }
        return null;
    }

    /**
     * Decides whether there is a boundary at offset {@code i} and records which rule
     * decided it (see {@link #getBreakRule()}).
     *
     * Start and end of text always break; a position between a lead and a trail
     * surrogate never breaks; otherwise the first rule that gives a verdict wins,
     * and if none does the position breaks.
     *
     * @param charSequence text to examine
     * @param i            offset into the text
     * @return true if there is a break at {@code i}
     */
    public boolean breaksAt(CharSequence charSequence, int i) {
        if (i == 0) {
            this.breakRule = BREAK_SOT;
            return true;
        }
        if (i == charSequence.length()) {
            this.breakRule = BREAK_EOT;
            return true;
        }
        boolean insideSurrogatePair = UTF16.isLeadSurrogate(charSequence.charAt(i - 1))
                && UTF16.isTrailSurrogate(charSequence.charAt(i));
        if (insideSurrogatePair) {
            this.breakRule = NOBREAK_SUPPLEMENTARY;
            return false;
        }
        final int ruleCount = this.rules.size();
        for (int index = 0; index < ruleCount; index++) {
            Rule rule = (Rule) this.rules.get(index);
            byte verdict = rule.matches(charSequence, i);
            if (verdict == Rule.UNKNOWN_BREAK) {
                continue;
            }
            this.breakRule = ((Double) this.orders.get(index)).doubleValue();
            return verdict == Rule.BREAK;
        }
        this.breakRule = BREAK_ANY;
        return true;
    }

    /**
     * Writes a single status value of 0 into the first slot of {@code iArr}.
     *
     * @param iArr output array; must have at least one element
     * @return the number of status values written, always 1
     */
    public int getRuleStatusVec(int[] iArr) {
        final int valuesWritten = 1;
        iArr[0] = 0;
        return valuesWritten;
    }

    /**
     * Appends a rule and its rule number to the two parallel lists.
     *
     * @param d    rule number
     * @param rule rule to append
     */
    public void add(double d, Rule rule) {
        Double order = Double.valueOf(d);
        this.orders.add(order);
        this.rules.add(rule);
    }

    /**
     * Looks up a rule by its rule number.
     *
     * @param d rule number
     * @return the first rule added under that number, or null if there is none
     */
    public Rule get(double d) {
        int position = this.orders.indexOf(Double.valueOf(d));
        return position < 0 ? null : (Rule) this.rules.get(position);
    }

    /**
     * @return the rule number recorded by the most recent {@code breaksAt} call
     */
    public double getBreakRule() {
        double lastDecidingRule = this.breakRule;
        return lastDecidingRule;
    }

    /**
     * @return the short listing, identical to {@code toString(false)}
     */
    public String toString() {
        final boolean showResolved = false;
        return toString(showResolved);
    }

    /**
     * Lists the rules in insertion order, one per line, separated by CRLF, each as
     * {@code "<number>)\t<rule>"}.
     *
     * @param z passed through to {@code Rule.toString(boolean)}
     * @return the listing; empty when there are no rules
     */
    public String toString(boolean z) {
        StringBuilder listing = new StringBuilder();
        int count = this.rules.size();
        for (int index = 0; index < count; index++) {
            if (index > 0) {
                listing.append("\r\n");
            }
            Rule rule = (Rule) this.rules.get(index);
            listing.append(this.orders.get(index)).append(")\t").append(rule.toString(z));
        }
        return listing.toString();
    }

    /**
     * Tests whether the text from offset {@code i} to the end starts with a match.
     *
     * @param matcher      matcher to reuse; it is reset onto the tail of the text
     * @param charSequence full text
     * @param i            offset where the tail begins
     * @return true if the pattern matches at the beginning of the tail
     */
    static boolean matchAfter(Matcher matcher, CharSequence charSequence, int i) {
        CharSequence tail = charSequence.subSequence(i, charSequence.length());
        matcher.reset(tail);
        return matcher.lookingAt();
    }

    /**
     * Tests whether the text before offset {@code i} matches the pattern in full.
     *
     * @param matcher      matcher to reuse; it is reset onto the head of the text
     * @param charSequence full text
     * @param i            offset where the head ends (exclusive)
     * @return true if the whole head matches
     */
    static boolean matchBefore(Matcher matcher, CharSequence charSequence, int i) {
        CharSequence head = charSequence.subSequence(0, i);
        matcher.reset(head);
        return matcher.matches();
    }

    /**
     * @return the sample map held by this segmenter (the live object, not a copy)
     */
    public UnicodeMap getSamples() {
        UnicodeMap current = this.samples;
        return current;
    }

    /* JADX WARN: Type inference failed for: r0v6, types: [java.lang.String[], java.lang.String[][]] */
    static {
        nf.setMinimumFractionDigits(0);
        // Orders by descending length of toString(); ties fall back to String.compareTo.
        LONGEST_STRING_FIRST = new Comparator() {
            @Override
            public int compare(Object obj, Object obj2) {
                String left = obj.toString();
                String right = obj2.toString();
                int leftLength = left.length();
                int rightLength = right.length();
                if (leftLength != rightLength) {
                    // The longer string sorts first.
                    return leftLength > rightLength ? -1 : 1;
                }
                return left.compareTo(right);
            }
        };
        cannedRules = new String[]{new String[]{"GraphemeClusterBreak", "$CR=\\p{Grapheme_Cluster_Break=CR}", "$LF=\\p{Grapheme_Cluster_Break=LF}", "$Control=\\p{Grapheme_Cluster_Break=Control}", "$Extend=\\p{Grapheme_Cluster_Break=Extend}", "$L=\\p{Grapheme_Cluster_Break=L}", "$V=\\p{Grapheme_Cluster_Break=V}", "$T=\\p{Grapheme_Cluster_Break=T}", "$LV=\\p{Grapheme_Cluster_Break=LV}", "$LVT=\\p{Grapheme_Cluster_Break=LVT}", "3) $CR  \t×  \t$LF", "4) ( $Control | $CR | $LF ) \t÷", "5) ÷ \t( $Control | $CR | $LF )", "6) $L \t× \t( $L | $V | $LV | $LVT )", "7) ( $LV | $V ) \t× \t( $V | $T )", "8) ( $LVT | $T) \t× \t$T", "9) × \t$Extend"}, new String[]{"LineBreak", "# Variables", "$AI=\\p{Line_Break=Ambiguous}", "$AL=\\p{Line_Break=Alphabetic}", "$B2=\\p{Line_Break=Break_Both}", "$BA=\\p{Line_Break=Break_After}", "$BB=\\p{Line_Break=Break_Before}", "$BK=\\p{Line_Break=Mandatory_Break}", "$CB=\\p{Line_Break=Contingent_Break}", "$CL=\\p{Line_Break=Close_Punctuation}", "$CM=\\p{Line_Break=Combining_Mark}", "$CR=\\p{Line_Break=Carriage_Return}", "$EX=\\p{Line_Break=Exclamation}", "$GL=\\p{Line_Break=Glue}", "$H2=\\p{Line_Break=H2}", "$H3=\\p{Line_Break=H3}", "$HY=\\p{Line_Break=Hyphen}", "$ID=\\p{Line_Break=Ideographic}", "$IN=\\p{Line_Break=Inseparable}", "$IS=\\p{Line_Break=Infix_Numeric}", "$JL=\\p{Line_Break=JL}", "$JT=\\p{Line_Break=JT}", "$JV=\\p{Line_Break=JV}", "$LF=\\p{Line_Break=Line_Feed}", "$NL=\\p{Line_Break=Next_Line}", "$NS=\\p{Line_Break=Nonstarter}", "$NU=\\p{Line_Break=Numeric}", "$OP=\\p{Line_Break=Open_Punctuation}", "$PO=\\p{Line_Break=Postfix_Numeric}", "$PR=\\p{Line_Break=Prefix_Numeric}", "$QU=\\p{Line_Break=Quotation}", "$SA=\\p{Line_Break=Complex_Context}", "$SG=\\p{Line_Break=Surrogate}", "$SP=\\p{Line_Break=Space}", "$SY=\\p{Line_Break=Break_Symbols}", "$WJ=\\p{Line_Break=Word_Joiner}", "$XX=\\p{Line_Break=Unknown}", "$ZW=\\p{Line_Break=ZWSpace}", "# LB 1  Assign a line breaking class to each code point of the input. 
", "# Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.", "# NOTE: CB is ok to fall through, but must handle others here.", "$AL=[$AI $AL $XX $SA $SG]", "# WARNING: Fixes for Rule 9", "# Treat X CM* as if it were X.", "# Where X is any line break class except SP, BK, CR, LF, NL or ZW.", "$X=$CM*", "$AI=($AI $X)", "$AL=($AL $X)", "$B2=($B2 $X)", "$BA=($BA $X)", "$BB=($BB $X)", "$CB=($CB $X)", "$CL=($CL $X)", "$CM=($CM $X)", "$CM=($CM $X)", "$GL=($GL $X)", "$H2=($H2 $X)", "$H3=($H3 $X)", "$HY=($HY $X)", "$ID=($ID $X)", "$IN=($IN $X)", "$IS=($IS $X)", "$JL=($JL $X)", "$JT=($JT $X)", "$JV=($JV $X)", "$NS=($NS $X)", "$NU=($NU $X)", "$OP=($OP $X)", "$PO=($PO $X)", "$PR=($PR $X)", "$QU=($QU $X)", "$SA=($SA $X)", "$SG=($SG $X)", "$SY=($SY $X)", "$WJ=($WJ $X)", "$XX=($XX $X)", "# OUT OF ORDER ON PURPOSE", "# LB 10  Treat any remaining combining mark as AL.", "$AL=($AL | ^ $CM | (?<=[$SP $BK $CR $LF $NL $ZW]) $CM)", "# LB 4  Always break after hard line breaks (but never between CR and LF).", "4) $BK ÷", "# LB 5  Treat CR followed by LF, as well as CR, LF and NL as hard line breaks.", "5.01) $CR × $LF", "5.02) $CR ÷", "5.03) $LF ÷", "5.04) $NL ÷", "# LB 6  Do not break before hard line breaks.", "6) × ( $BK | $CR | $LF | $NL )", "# LB 7  Do not break before spaces or zero-width space.", "7.01) × $SP", "7.02) × $ZW", "# LB 8  Break after zero-width space.", "8) $ZW ÷", "# LB 9  Do not break a combining character sequence; treat it as if it has the LB class of the base character", "# in all of the following rules. (Where X is any line break class except SP, BK, CR, LF, NL or ZW.)", "9) [^$SP $BK $CR $LF $NL $ZW] × $CM", "#WARNING: this is done by modifying the variable values for all but SP.... 
That is, $AL is really ($AI $CM*)!", "# LB 11  Do not break before or after WORD JOINER and related characters.", "11.01) × $WJ", "11.02) $WJ ×", "# LB 12  Do not break before or after NBSP and related characters.", "12.01) [^$SP] × $GL", "12.02) $GL ×", "# LB 13  Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.", "# Using customization 7.", "13.01) [^$NU] × $CL", "13.02) × $EX", "13.03) [^$NU] × $IS", "13.04) [^$NU] × $SY", "#LB 14  Do not break after ‘[’, even after spaces.", "14) $OP $SP* ×", "# LB 15  Do not break within ‘\"[’, even with intervening spaces.", "15) $QU $SP* × $OP", "# LB 16  Do not break within ‘]h’, even with intervening spaces.", "16) $CL $SP* × $NS", "# LB 17  Do not break within ‘——’, even with intervening spaces.", "17) $B2 $SP* × $B2", "# LB 18  Break after spaces.", "18) $SP ÷", "# LB 19  Do not break before or after ‘\"’.", "19.01)  × $QU", "19.02) $QU ×", "# LB 20  Break before and after unresolved CB.", "20.01)  ÷ $CB", "20.02) $CB ÷", "# LB 21  Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents.", "21.01) × $BA", "21.02) × $HY", "21.03) × $NS", "21.04) $BB ×", "# LB 22  Do not break between two ellipses, or between letters or numbers and ellipsis.", "22.01) $AL × $IN", "22.02) $ID × $IN", "22.03) $IN × $IN", "22.04) $NU × $IN", "# LB 23  Do not break within ‘a9’, ‘3a’, or ‘H%’.", "23.01) $ID × $PO", "23.02) $AL × $NU", "23.03) $NU × $AL", "# LB 24  Do not break between prefix and letters or ideographs.", "24.01) $PR × $ID", "24.02) $PR × $AL", "24.03) $PO × $AL", "# Using customization 7", "# LB 18  Do not break between the following pairs of classes.", "# LB 18-alternative: $PR? ( $OP | $HY )? $NU ($NU | $SY | $IS)* $CL? $PO?", "# Insert × every place it could go. However, make sure that at least one thing is concrete, otherwise would cause $NU to not break before or after ", "25.01) ($PR | $PO) × ( $OP | $HY )? 
$NU", "25.02) ( $OP | $HY ) × $NU", "25.03) $NU × ($NU | $SY | $IS)", "25.04) $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL)", "25.05) $NU ($NU | $SY | $IS)* $CL? × ($PO | $PR)", "#LB 26 Do not break a Korean syllable.", "26.01) $JL  × $JL | $JV | $H2 | $H3", "26.02) $JV | $H2 × $JV | $JT", "26.03) $JT | $H3 × $JT", "# LB 27 Treat a Korean Syllable Block the same as ID.", "27.01) $JL | $JV | $JT | $H2 | $H3 × $IN", "27.02) $JL | $JV | $JT | $H2 | $H3  × $PO", "27.03) $PR × $JL | $JV | $JT | $H2 | $H3", "# LB 28  Do not break between alphabetics (\"at\").", "28) $AL × $AL", "# LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").", "29) $IS × $AL", "# LB 30  Do not break between letters, numbers or ordinary symbols and opening or closing punctuation.", "30.01) ($AL | $NU) × $OP", "30.02) $CL × ($AL | $NU)"}, new String[]{"SentenceBreak", "# GC stuff", "$GCCR=\\p{Grapheme_Cluster_Break=CR}", "$GCLF=\\p{Grapheme_Cluster_Break=LF}", "$GCControl=\\p{Grapheme_Cluster_Break=Control}", "$GCExtend=\\p{Grapheme_Cluster_Break=Extend}", "# Normal variables", "$Format=\\p{Sentence_Break=Format}", "$Sep=\\p{Sentence_Break=Sep}", "$Sp=\\p{Sentence_Break=Sp}", "$Lower=\\p{Sentence_Break=Lower}", "$Upper=\\p{Sentence_Break=Upper}", "$OLetter=\\p{Sentence_Break=OLetter}", "$Numeric=\\p{Sentence_Break=Numeric}", "$ATerm=\\p{Sentence_Break=ATerm}", "$STerm=\\p{Sentence_Break=STerm}", "$Close=\\p{Sentence_Break=Close}", "$Any=.", "# Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.", "# $ATerm and $Sterm are temporary, to match ICU until UTC decides.", "# WARNING: For Rule 5, now add format and extend to everything but Sep", "$X=[$Format $GCExtend]*", "$Sp=(($Sp | [$Sp - $GCControl] $GCExtend*) $Format*)", "$Lower=($Lower $X)", "$Upper=($Upper $X)", "$OLetter=($OLetter $X)", "$Numeric=($Numeric $X)", "$ATerm=($ATerm $X)", "$STerm=($STerm $X)", "$Close=($Close $X)", "# Do not break within CRLF", 
"3) $GCCR  \t×  \t$GCLF", "# Break after paragraph separators.", "4) $Sep  \t÷", "# Ignore Format and Extend characters, except when they appear at the beginning of a region of text.", "# (See Section 6.2 Grapheme Cluster and Format Rules.)", "# WARNING: Implemented as don't break before format (except after linebreaks),", "# AND add format and extend in all variables definitions that appear after this point!", "5) [^$Sep] × [$Format $GCExtend]", "# Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,", "# is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.", "# For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.", "6) $ATerm \t× \t$Numeric", "7) $Upper $ATerm \t× \t$Upper", "8) $ATerm $Close* $Sp* \t× \t[^$OLetter $Upper $Lower $Sep]* $Lower", "8.1) ($STerm | $ATerm) $Close* $Sp* \t× \t($STerm | $ATerm)", "#Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.", "9) ( $STerm | $ATerm ) $Close* \t× \t( $Close | $Sp | $Sep )", "# Note the fix to $Sp*, $Sep?", "10) ( $STerm | $ATerm ) $Close* $Sp* \t× \t( $Sp | $Sep )", "11) ( $STerm | $ATerm ) $Close* $Sp* $Sep? 
÷", "#Otherwise, do not break", "12) × \t$Any"}, new String[]{"WordBreak", "# GC stuff", "$GCCR=\\p{Grapheme_Cluster_Break=CR}", "$GCLF=\\p{Grapheme_Cluster_Break=LF}", "$GCControl=\\p{Grapheme_Cluster_Break=Control}", "$GCExtend=\\p{Grapheme_Cluster_Break=Extend}", "$Sep=\\p{Sentence_Break=Sep}", "# Now normal variables", "$Format=\\p{Word_Break=Format}", "$Katakana=\\p{Word_Break=Katakana}", "$ALetter=\\p{Word_Break=ALetter}", "$MidLetter=\\p{Word_Break=MidLetter}", "$MidNum=\\p{Word_Break=MidNum}", "$Numeric=\\p{Word_Break=Numeric}", "$ExtendNumLet=\\p{Word_Break=ExtendNumLet}", "# WARNING: For Rule 4: Fixes for GC, Format", "# Add format and extend to everything", "$X=[$Format $GCExtend]*", "$Katakana=($Katakana $X)", "$ALetter=($ALetter $X)", "$MidLetter=($MidLetter $X)", "$MidNum=($MidNum $X)", "$Numeric=($Numeric $X)", "$ExtendNumLet=($ExtendNumLet $X)", "3) $GCCR  \t×  \t$GCLF", "# Ignore Format and Extend characters, except when they appear at the beginning of a region of text.", "# (See Section 6.2 Grapheme Cluster and Format Rules.)", "# WARNING: Implemented as don't break before format (except after linebreaks),", "# AND add format and extend in all variables definitions that appear after this point!", "4) [^ $Sep ] × [$Format $GCExtend]", "# Vanilla rules", "5)$ALetter  \t×  \t$ALetter", "6)$ALetter \t× \t$MidLetter $ALetter", "7)$ALetter $MidLetter \t× \t$ALetter", "8)$Numeric \t× \t$Numeric", "9)$ALetter \t× \t$Numeric", "10)$Numeric \t× \t$ALetter", "11)$Numeric $MidNum \t× \t$Numeric", "12)$Numeric \t× \t$MidNum $Numeric", "13)$Katakana \t× \t$Katakana", "13.1)($ALetter | $Numeric | $Katakana | $ExtendNumLet) \t× \t$ExtendNumLet", "13.2)$ExtendNumLet \t× \t($ALetter | $Numeric | $Katakana)"}};
    }
}
