Removed custom UTF8 encoding implementation, deprecated all its
methods, and ran everything through java's built-in decoder. git-svn-id: http://lampsvn.epfl.ch/svn-repos/scala/scala/trunk@18649 5e8d7ff9-d8ef-0310-90f0-a4852d11357a
This commit is contained in:
parent
d964e3fa56
commit
cbf617572c
|
@ -8,7 +8,7 @@ package scala.tools.nsc
|
|||
package symtab
|
||||
|
||||
import scala.util.NameTransformer
|
||||
import scala.io.UTF8Codec
|
||||
import scala.io.Codec
|
||||
import java.security.MessageDigest
|
||||
|
||||
/** The class <code>Names</code> ...
|
||||
|
@ -87,24 +87,19 @@ class Names {
|
|||
|
||||
private lazy val md5 = MessageDigest.getInstance("MD5")
|
||||
|
||||
private def toMD5(s: String, prefixSuffixLen: Int) = {
|
||||
// println("COMPACTIFY "+s)
|
||||
val cs: Array[Char] = s.toCharArray
|
||||
val bytes = new Array[Byte](cs.length * 4)
|
||||
val len = UTF8Codec.encode(cs, 0, bytes, 0, cs.length)
|
||||
md5.update(bytes, 0, len)
|
||||
val hash = md5.digest()
|
||||
val sb = new StringBuilder
|
||||
sb.appendAll(cs, 0, prefixSuffixLen)
|
||||
sb.append("$$$$")
|
||||
for (i <- 0 until hash.length) {
|
||||
val b = hash(i)
|
||||
sb.append(((b >> 4) & 0xF).toHexString)
|
||||
sb.append((b & 0xF).toHexString)
|
||||
}
|
||||
sb.append("$$$$")
|
||||
sb.appendAll(cs, len - prefixSuffixLen, prefixSuffixLen)
|
||||
sb.toString
|
||||
/** "COMPACTIFY": builds a shortened form of `s` consisting of a prefix and a
 *  suffix of `s` with the hex-encoded MD5 digest of the whole string in
 *  between, each side of the digest delimited by the marker `$$$$`.
 *
 *  @param s    the string to compact
 *  @param edge the length passed to `take` / `takeRight` to obtain the
 *              prefix and suffix that are kept verbatim
 *  @return     prefix + marker + digest (two hex digits per byte) + marker + suffix
 */
private def toMD5(s: String, edge: Int) = {
  import collection.immutable.StringVector._
  val prefix = take(s, edge)
  val suffix = takeRight(s, edge)
  val marker = "$$$$"

  // A String already is a CharSequence, so no intermediate char array is needed.
  md5 update Codec.fromUTF8(s)
  // Emit both nibbles of every byte. Calling toHexString on the whole byte
  // drops the leading zero for values below 0x10, which makes the digest
  // part variable-length and lets different digests render identically.
  val md5chars = md5.digest().map { b =>
    ((b >> 4) & 0xF).toHexString + (b & 0xF).toHexString
  }.mkString

  prefix + marker + md5chars + marker + suffix
}
|
||||
|
||||
def compactify(s: String): String =
|
||||
|
@ -142,11 +137,8 @@ class Names {
|
|||
* @param len ...
|
||||
* @return the created term name
|
||||
*/
|
||||
def newTermName(bs: Array[Byte], offset: Int, len: Int): Name = {
|
||||
val cs = new Array[Char](bs.length)
|
||||
val nchrs = UTF8Codec.decode(bs, offset, cs, 0, len)
|
||||
newTermName(cs, 0, nchrs)
|
||||
}
|
||||
def newTermName(bs: Array[Byte], offset: Int, len: Int): Name = {
  // Decode only the window bs[offset .. offset+len-1], then go through the
  // String-based factory.
  val decoded = Codec.toUTF8(bs.slice(offset, offset + len))
  newTermName(decoded.mkString)
}
|
||||
|
||||
/** Create a type name from the characters in <code>cs[offset..offset+len-1]</code>.
|
||||
*
|
||||
|
@ -173,7 +165,6 @@ class Names {
|
|||
def newTypeName(bs: Array[Byte], offset: Int, len: Int): Name =
|
||||
newTermName(bs, offset, len).toTypeName
|
||||
|
||||
|
||||
def nameChars: Array[Char] = chrs
|
||||
|
||||
implicit def view(s: String): Name = newTermName(s)
|
||||
|
@ -228,8 +219,11 @@ class Names {
|
|||
* Array must have enough remaining space for all bytes
|
||||
* (i.e. maximally 3*length bytes).
|
||||
*/
|
||||
final def copyUTF8(bs: Array[Byte], offset: Int): Int =
|
||||
UTF8Codec.encode(chrs, index, bs, offset, len)
|
||||
final def copyUTF8(bs: Array[Byte], offset: Int): Int = {
  // UTF-8 encoding of this name's characters, chrs[index .. index+len-1].
  val encoded = Codec.fromUTF8(chrs.slice(index, index + len))
  val written = encoded.length
  compat.Platform.arraycopy(encoded, 0, bs, offset, written)
  // Result is the position in `bs` just past the last byte written.
  offset + written
}
|
||||
|
||||
/** return the hash value of this name
|
||||
*/
|
||||
|
|
|
@ -12,7 +12,6 @@ import java.io.IOException
|
|||
import java.lang.{Float, Double}
|
||||
|
||||
import scala.tools.nsc.util.{Position, NoPosition}
|
||||
import scala.io.UTF8Codec
|
||||
|
||||
import Flags._
|
||||
import PickleFormat._
|
||||
|
|
|
@ -271,8 +271,11 @@ object BytePickle {
|
|||
(s.stream(0), new UnPicklerState(s.stream.slice(1, s.stream.length), s.dict));
|
||||
}
|
||||
|
||||
def string: SPU[String] =
|
||||
share(wrap((a: Array[Byte]) => UTF8Codec.decode(a, 0, a.length), (s:String) => UTF8Codec.encode(s), bytearray));
|
||||
def string: SPU[String] = {
  // Strings are pickled as their UTF-8 byte arrays, on top of `bytearray`.
  val bytesToString = (a: Array[Byte]) => Codec.toUTF8(a).mkString
  val stringToBytes = (s: String) => Codec.fromUTF8(s)
  share(wrap(bytesToString, stringToBytes, bytearray))
}
|
||||
|
||||
def bytearray: SPU[Array[Byte]] = {
|
||||
wrap((l:List[Byte]) => l.toArray, (_.toList), list(byte))
|
||||
|
|
|
@ -83,6 +83,24 @@ object Codec {
|
|||
new Codec(decoder.charset()) { override def decoder = _decoder }
|
||||
}
|
||||
|
||||
/** Decodes `bytes` with `UTF8` and returns the resulting characters.
 *  Note that, despite the name, the direction is bytes -> chars.
 *
 *  @param bytes the encoded input
 *  @return      a fresh array holding every decoded character
 */
def toUTF8(bytes: Array[Byte]): Array[Char] = {
  val decoded = UTF8.decode(java.nio.ByteBuffer.wrap(bytes))
  // Size the result from what the decoder actually produced.
  val out = new Array[Char](decoded.remaining())
  decoded.get(out)
  out
}
|
||||
|
||||
/** Encodes the characters of `cs` with `UTF8` and returns the resulting bytes.
 *  Note that, despite the name, the direction is chars -> bytes.
 *
 *  @param cs the characters to encode
 *  @return   a fresh array holding every encoded byte
 */
def fromUTF8(cs: CharSequence): Array[Byte] = {
  val encoded = UTF8.encode(java.nio.CharBuffer.wrap(cs))
  // Size the result from what the encoder actually produced.
  val out = new Array[Byte](encoded.remaining())
  encoded.get(out)
  out
}
|
||||
|
||||
implicit def string2codec(s: String) = apply(s)
|
||||
implicit def charset2codec(c: Charset) = apply(c)
|
||||
implicit def decoder2codec(cd: CharsetDecoder) = apply(cd)
|
||||
|
|
|
@ -8,112 +8,58 @@
|
|||
|
||||
// $Id$
|
||||
|
||||
|
||||
package scala.io
|
||||
|
||||
/**
|
||||
* @author Martin Odersky
|
||||
* @version 1.0, 04/10/2004
|
||||
*/
|
||||
object UTF8Codec {
|
||||
|
||||
object UTF8Codec
|
||||
{
|
||||
final val UNI_REPLACEMENT_CHAR: Int = 0x0000FFFD
|
||||
final val UNI_REPLACEMENT_BYTES = encode(UNI_REPLACEMENT_CHAR)
|
||||
|
||||
// Note, from http://unicode.org/faq/utf_bom.html#utf8-5
|
||||
//
|
||||
// A different issue arises if an unpaired surrogate is encountered when converting
|
||||
// ill-formed UTF-16 data. By represented such an unpaired surrogate on its own as a
|
||||
// 3-byte sequence, the resulting UTF-8 data stream would become ill-formed.
|
||||
// While it faithfully reflects the nature of the input, Unicode conformance
|
||||
// requires that encoding form conversion always results in valid data stream.
|
||||
// Therefore a converter must treat this as an error.
|
||||
//
|
||||
// Some useful locations:
|
||||
// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
||||
|
||||
/** convert a codepoint to utf-8 bytes
|
||||
* @author buraq
|
||||
* @param ch codepoint
|
||||
*/
|
||||
def encode(ch1: Int): Array[Byte] = {
|
||||
var ch = ch1
|
||||
val byteMask = 0xBF
|
||||
val byteMark = 0x80
|
||||
var bytesToWrite = 0
|
||||
val firstByteMark = List[Byte](0x00.toByte, 0x00.toByte, 0xC0.toByte, 0xE0.toByte, 0xF0.toByte, 0xF8.toByte, 0xFC.toByte)
|
||||
|
||||
if (ch < 0x80) { bytesToWrite = 1 }
|
||||
else if (ch < 0x800) { bytesToWrite = 2 }
|
||||
else if (ch < 0x10000) { bytesToWrite = 3 }
|
||||
else if (ch <= 0x0010FFFF) { bytesToWrite = 4 }
|
||||
else return encode(UNI_REPLACEMENT_CHAR)
|
||||
|
||||
val res = new Array[Byte](bytesToWrite)
|
||||
|
||||
var bw = bytesToWrite
|
||||
if (bw >= 4) {
|
||||
res(3) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1
|
||||
}
|
||||
if (bw >= 3) {
|
||||
res(2) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1
|
||||
}
|
||||
if (bw >= 2) {
|
||||
res(1) = ((ch | byteMark) & byteMask).toByte; ch = ch >> 6; bw -= 1
|
||||
}
|
||||
if (bw >= 1) {
|
||||
res(0) = (ch | firstByteMark(bytesToWrite)).toByte
|
||||
}
|
||||
res
|
||||
}
|
||||
/** Converts a single code point to its UTF-8 bytes.
 *  Surrogate code points, and values rejected by the `String` constructor,
 *  yield `UNI_REPLACEMENT_BYTES` instead.
 *
 *  @param ch the code point
 */
@deprecated("""Use new String(Array(ch), 0, 1).getBytes("UTF-8") instead""")
def encode(ch: Int): Array[Byte] = {
  val isSurrogate = Character.getType(ch) == Character.SURROGATE
  if (isSurrogate) UNI_REPLACEMENT_BYTES
  else {
    try new String(Array(ch), 0, 1).getBytes("UTF-8")
    catch {
      case _: IllegalArgumentException => UNI_REPLACEMENT_BYTES
    }
  }
}
|
||||
|
||||
/** Encodes `src[from .. from+len-1]` as UTF-8 into `dst`, starting at index `to`.
 *
 *  @param src  the characters to encode
 *  @param from index of the first character to encode
 *  @param dst  the destination array; must have room for all encoded bytes
 *  @param to   index in `dst` at which the first byte is written
 *  @param len  number of characters to encode
 *  @return     the index in `dst` just past the last byte written
 */
@deprecated("Use Codec.fromUTF8 instead")
def encode(src: Array[Char], from: Int, dst: Array[Byte], to: Int, len: Int): Int = {
  val bytes = Codec.fromUTF8(src.slice(from, from + len))
  Array.copy(bytes, 0, dst, to, bytes.length)
  // The hand-rolled encoder returned the end position in `dst`, not the byte
  // count; keep that contract so callers with a non-zero `to` stay correct.
  to + bytes.length
}
|
||||
|
||||
/** Encodes all of `s` as UTF-8 into `dst`, starting at index `to`.
 *  Delegates to the char-array overload and returns whatever it returns.
 */
@deprecated("Use Codec.fromUTF8 instead")
def encode(s: String, dst: Array[Byte], to: Int): Int =
  encode(s.toCharArray, 0, dst, to, s.length)
|
||||
|
||||
def encode(s: String): Array[Byte] = {
|
||||
val dst = new Array[Byte](s.length() * 3)
|
||||
val len = encode(s, dst, 0)
|
||||
dst.slice(0, len)
|
||||
/** Returns the bytes produced by `Codec.fromUTF8` for `s`. */
@deprecated("Use Codec.fromUTF8 instead")
def encode(s: String): Array[Byte] =
  Codec.fromUTF8(s)
|
||||
|
||||
/** Decodes the UTF-8 bytes `src[from .. from+len-1]` into `dst`, starting at index `to`.
 *
 *  @param src  the encoded bytes
 *  @param from index of the first byte to decode
 *  @param dst  the destination array; must have room for all decoded characters
 *  @param to   index in `dst` at which the first character is written
 *  @param len  number of bytes to decode
 *  @return     the index in `dst` just past the last character written
 */
@deprecated("Use Codec.toUTF8 instead")
def decode(src: Array[Byte], from: Int, dst: Array[Char], to: Int, len: Int): Int = {
  val chars = Codec.toUTF8(src.slice(from, from + len))
  Array.copy(chars, 0, dst, to, chars.length)
  // The hand-rolled decoder returned the end position in `dst`, not the char
  // count; keep that contract so callers with a non-zero `to` stay correct.
  to + chars.length
}
|
||||
|
||||
def decode(src: Array[Byte], from: Int,
|
||||
dst: Array[Char], to: Int, len: Int): Int =
|
||||
{
|
||||
var i = from
|
||||
var j = to
|
||||
val end = from + len
|
||||
while (i < end) {
|
||||
var b = src(i) & 0xFF
|
||||
i += 1
|
||||
if (b >= 0xE0) {
|
||||
b = ((b & 0x0F) << 12) | (src(i) & 0x3F) << 6
|
||||
b = b | (src(i+1) & 0x3F)
|
||||
i += 2
|
||||
} else if (b >= 0xC0) {
|
||||
b = ((b & 0x1F) << 6) | (src(i) & 0x3F)
|
||||
i += 1
|
||||
}
|
||||
dst(j) = b.toChar
|
||||
j += 1
|
||||
}
|
||||
j
|
||||
}
|
||||
|
||||
def decode(src: Array[Byte], from: Int, len: Int): String = {
|
||||
val cs = new Array[Char](len)
|
||||
new String(cs, 0, decode(src, from, cs, 0, len))
|
||||
}
|
||||
|
||||
}
|
||||
/** Decodes the bytes `src[from .. from+len-1]` via `Codec.toUTF8` and
 *  returns the characters as a String.
 */
@deprecated("Use Codec.toUTF8 instead")
def decode(src: Array[Byte], from: Int, len: Int): String = {
  val chars = Codec.toUTF8(src.slice(from, from + len))
  chars.mkString
}
|
||||
}
|
|
@ -414,7 +414,6 @@ object Utility extends AnyRef with parsing.TokenTests
|
|||
}
|
||||
nextch()
|
||||
}
|
||||
new String(io.UTF8Codec.encode(i), "utf8")
|
||||
new String(Array(i), 0, 1)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue