Skip to content

Non-ascii method names getting kicked out #2698

@headius

Description

@headius

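This test in MRI's test_m17n.rb fails because the lexer rejects the non-ASCII method name: as the trace below shows, an EncodingException is raised from RubyLexer.isMultiByteChar while the identifier is being read. It appears that our logic for checking identifier characters is not correct, and RubyLexer.identifier (and the code downstream of it) needs updating.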

TestM17N#test_nonascii_method_name:
Java::OrgJcodingsException::EncodingException: invalid code point value
    org.jcodings.specific.BaseEUCJPEncoding.codeToMbcLength(BaseEUCJPEncoding.java:55)
    org.jcodings.specific.EUCJPEncoding.codeToMbcLength(EUCJPEncoding.java:24)
    org.jruby.lexer.yacc.RubyLexer.isMultiByteChar(RubyLexer.java:638)
    org.jruby.lexer.yacc.RubyLexer.getIdentifier(RubyLexer.java:1365)
    org.jruby.lexer.yacc.RubyLexer.identifier(RubyLexer.java:1779)

I started a patch to re-port this logic from MRI, but I'm not familiar enough with the lexer to complete it. The diff below is an incomplete work in progress and does not compile as-is.

diff --git a/core/src/main/java/org/jruby/lexer/yacc/RubyLexer.java b/core/src/main/java/org/jruby/lexer/yacc/RubyLexer.java
index fa26107..4db7e14 100644
--- a/core/src/main/java/org/jruby/lexer/yacc/RubyLexer.java
+++ b/core/src/main/java/org/jruby/lexer/yacc/RubyLexer.java
@@ -621,7 +621,7 @@ public class RubyLexer {
      * mri: is_identchar
      */
     public boolean isIdentifierChar(int c) {
-        return Character.isLetterOrDigit(c) || c == '_' || isMultiByteChar(c);
+        return encoding.isAlnum(c) || c == '_' || !Encoding.isAscii(c);
     }

     public boolean isASCII(int c) {
@@ -1768,6 +1768,20 @@ public class RubyLexer {
     }

     private int identifier(int c, boolean commandState) throws IOException {
+        ByteList aggregate = new ByteList();
+        do {
+            if (!Encoding.isAscii(c)) mb = ENC_CODERANGE_UNKNOWN;
+            if (tokenAddMBC(c, aggregate) == -1) return 0;
+            c = nextToken();
+        } while (isIdentifierChar(c));
+
+        int result = 0;
+
+        if (c == '!' || c == '?') {
+            result = Tokens.tFID;
+        } else {
+            this
+        }
         if (!isIdentifierChar(c)) {
             String badChar = "\\" + Integer.toOctalString(c & 0xff);
             throw new SyntaxException(PID.CHARACTER_BAD, getPosition(), getCurrentLine(),
@@ -1791,8 +1805,7 @@ public class RubyLexer {
         } else {
             src.unread(c);
         }
-        
-        int result = 0;
+

         last_state = lex_state;
         if (lastBangOrPredicate) {

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions