File.basename should support windows1250#3877
Conversation
|
@headius could you take a look? |
|
@ahorek Sure, looking now. |
|
This is just a test, right? Or do you have a patch coming? |
|
Ok, the issue here is that we do most of our path-munging using a UTF-16 Java string, and then when we're done we need to transcode back into whatever the original encoding was. However Java does not have a transcoder for Windows-1250. I will try to modify this to just use Ruby transcoding. |
|
Here's a patch I believe solves the issue. Gotta run, though...maybe @ahorek can confirm and review it for me? diff --git a/core/src/main/java/org/jruby/RubyFile.java b/core/src/main/java/org/jruby/RubyFile.java
index c5efbcd..268c8bb 100644
--- a/core/src/main/java/org/jruby/RubyFile.java
+++ b/core/src/main/java/org/jruby/RubyFile.java
@@ -518,15 +518,7 @@ public class RubyFile extends RubyIO implements EncodingCapable {
case 2:
return RubyString.newEmptyString(runtime, origString.getEncoding()).infectBy(args[0]);
case 3:
- if (origEncoding.getCharset() != null) {
- try {
- return RubyString.newString(runtime, new ByteList(name.substring(2).getBytes(origEncoding.getCharsetName()), origString.getEncoding())).infectBy(args[0]);
- } catch (UnsupportedEncodingException uee) {
- // fall through to UTF-8 logic
- }
- }
-
- return RubyString.newString(runtime, name.substring(2)).infectBy(args[0]);
+ return RubyString.newString(runtime, RubyString.encodeBytelist(name.substring(2), origEncoding));
default:
switch (name.charAt(2)) {
case '/':
@@ -581,15 +573,8 @@ public class RubyFile extends RubyIO implements EncodingCapable {
name = name.substring(0, name.length() - ext.length());
}
}
- if (origEncoding.getCharset() != null) {
- try {
- return RubyString.newString(runtime, new ByteList(name.getBytes(origEncoding.getCharsetName()), origString.getEncoding())).infectBy(args[0]);
- } catch (UnsupportedEncodingException uee) {
- // fall through to UTF-8 logic
- }
- }
- return RubyString.newString(runtime, name).infectBy(args[0]);
+ return RubyString.newString(runtime, RubyString.encodeBytelist(name, origEncoding));
}
@JRubyMethod(required = 2, rest = true, meta = true)
@@ -1370,8 +1355,7 @@ public class RubyFile extends RubyIO implements EncodingCapable {
pathEncoding != encodingService.getAscii8bitEncoding() &&
pathEncoding != encodingService.getFileSystemEncoding(runtime) &&
!path.isAsciiOnly()) {
- ByteList bytes = EncodingUtils.strConvEnc(context, path.getByteList(), pathEncoding, encodingService.getFileSystemEncoding(runtime));
- path = RubyString.newString(runtime, bytes);
+ path = EncodingUtils.strConvEnc(context, path, pathEncoding, encodingService.getFileSystemEncoding(runtime));
}
}
diff --git a/core/src/main/java/org/jruby/RubyString.java b/core/src/main/java/org/jruby/RubyString.java
index 2dd0dd2..5e68938 100644
--- a/core/src/main/java/org/jruby/RubyString.java
+++ b/core/src/main/java/org/jruby/RubyString.java
@@ -5463,8 +5463,10 @@ public class RubyString extends RubyObject implements EncodingCapable, MarshalEn
Charset charset = encoding.getCharset();
- // if null charset, fall back on Java default charset
- if (charset == null) charset = Charset.defaultCharset();
+ // if null charset, let our transcoder handle it
+ if (charset == null) {
+ return EncodingUtils.transcodeString(value.toString(), encoding, 0);
+ }
byte[] bytes;
if (charset == RubyEncoding.UTF8) {
diff --git a/core/src/main/java/org/jruby/ext/stringio/StringIO.java b/core/src/main/java/org/jruby/ext/stringio/StringIO.java
index 386b519..df0ebc1 100644
--- a/core/src/main/java/org/jruby/ext/stringio/StringIO.java
+++ b/core/src/main/java/org/jruby/ext/stringio/StringIO.java
@@ -1007,7 +1007,7 @@ public class StringIO extends RubyObject implements EncodingCapable {
if (enc != encStr && enc != EncodingUtils.ascii8bitEncoding(runtime)
// this is a hack because we don't seem to handle incoming ASCII-8BIT properly in transcoder
&& encStr != ASCIIEncoding.INSTANCE) {
- str = runtime.newString(EncodingUtils.strConvEnc(context, strByteList, encStr, enc));
+ str = EncodingUtils.strConvEnc(context, str, encStr, enc);
}
len = str.size();
if (len == 0) return RubyFixnum.zero(runtime);
diff --git a/core/src/main/java/org/jruby/util/io/EncodingUtils.java b/core/src/main/java/org/jruby/util/io/EncodingUtils.java
index 62e29ce..b898ce8 100644
--- a/core/src/main/java/org/jruby/util/io/EncodingUtils.java
+++ b/core/src/main/java/org/jruby/util/io/EncodingUtils.java
@@ -3,6 +3,7 @@ package org.jruby.util.io;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.Ptr;
+import org.jcodings.exception.TranscoderException;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.USASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
@@ -1251,6 +1252,67 @@ public class EncodingUtils {
}
}
+ /**
+ * A version of transcodeLoop for working without any Ruby runtime available.
+ *
+ * MRI: transcode_loop minus fallback and any other Ruby bits
+ */
+ public static ByteList transcodeString(String string, Encoding toEncoding, int ecflags) {
+ Encoding encoding;
+
+ // This may be inefficient if we aren't matching endianness right
+ if (Platform.BYTE_ORDER == Platform.LITTLE_ENDIAN) {
+ encoding = UTF16LEEncoding.INSTANCE;
+ } else {
+ encoding = UTF16BEEncoding.INSTANCE;
+ }
+
+ EConv ec = TranscoderDB.open(encoding.getName(), toEncoding.getName(), ecflags);
+
+ byte[] bytes = string.getBytes(encoding.getCharset());
+ ByteList sp = new ByteList(bytes, encoding, false);
+ ByteList fromp = sp;
+ int slen = sp.realSize();
+ // most encodings will be shorter than UTF-16 for typical input
+ int blen = (int)((double)slen / 1.5 + 1);
+ ByteList destp = new ByteList(blen);
+ destp.setEncoding(toEncoding);
+
+ byte[] frompBytes = fromp.unsafeBytes();
+ byte[] destpBytes = destp.unsafeBytes();
+ Ptr frompPos = new Ptr(fromp.getBegin());
+ Ptr destpPos = new Ptr(destp.getBegin());
+ Ptr outStop = new Ptr(blen);
+
+ Transcoding lastTC = ec.lastTranscoding;
+ int maxOutput = lastTC != null ? lastTC.transcoder.maxOutput : 1;
+
+ Ptr outStart = new Ptr(0);
+
+ // resume:
+ while (true) {
+ EConvResult ret = ec.convert(frompBytes, frompPos, frompBytes.length, destpBytes, destpPos, outStop.p, 0);
+
+ if (ret == EConvResult.InvalidByteSequence ||
+ ret == EConvResult.IncompleteInput ||
+ ret == EConvResult.UndefinedConversion) {
+ TranscoderException re = new TranscoderException(ret.name());
+ ec.close();
+ throw re;
+ }
+
+ if (ret == EConvResult.DestinationBufferFull) {
+ moreOutputBuffer(destp, strTranscodingResize, maxOutput, outStart, destpPos, outStop);
+ destpBytes = destp.getUnsafeBytes();
+ continue;
+ }
+
+ ec.close();
+
+ return destp;
+ }
+ }
+
// make_econv_exception
public static RaiseException makeEconvException(Ruby runtime, EConv ec) {
final StringBuilder mesg = new StringBuilder(); RaiseException exc; |
|
Sorry, the StringIO and RubyFile parts of this are unrelated. |
|
@headius if this end up being reasonable to backport along with you expand_path fix then we should. |
|
@headius it works fine with your modifications of RubyFile.java, thanks |
In prep for a String-based transcode to address the test in #3877.
Previously, if the target encoding was not supported by the JDK, we would be unable to encode the string and would just make it UTF-8. Now if there's no JDK support we will fall back on joni transcoding. Part of #3877 work.
|
@ahorek I have pushed my work to master and turned your test into a spec for https://github.com/ruby/spec. @enebo The tricky bit is that none of this EncodingUtils really exists in 1.7. THIS work could be the first opportunity to start using joni transcoding in 1.7, but I don't know if that's a slope down which we want to start slipping. The other fix can probably be ported without issues. |
|
@headius yeah I guess that sounds like it might not be worth the potential pain. For people who need properly encoded paths we will just strongly encourage using 9k now. |
since #3871
File.basename doesn't seem to handle the Windows-1250 encoding properly
mri
jruby
https://github.com/jruby/jruby/blob/master/core/src/main/java/org/jruby/RubyFile.java#L584
causes a bundler failure once again