mirror of https://github.com/erickguan/ffi-icu.git
Normalization fixes
This commit is contained in:
parent
2c3ec3d6e9
commit
2924ebef0e
|
@ -14,6 +14,10 @@ module ICU
|
|||
os
|
||||
end
|
||||
end
|
||||
|
||||
def self.ruby19?
|
||||
RUBY_VERSION >= '1.9'
|
||||
end
|
||||
end
|
||||
|
||||
require "ffi-icu/lib"
|
||||
|
@ -23,3 +27,7 @@ require "ffi-icu/collation"
|
|||
require "ffi-icu/transliteration"
|
||||
require "ffi-icu/normalization"
|
||||
|
||||
unless ICU.ruby19?
|
||||
require 'jcode'
|
||||
$KCODE = 'u'
|
||||
end
|
||||
|
|
|
@ -81,7 +81,8 @@ module ICU
|
|||
attach_function :uenum_count, "uenum_count#{suffix}", [:pointer, :pointer], :int
|
||||
attach_function :uenum_close, "uenum_close#{suffix}", [:pointer], :void
|
||||
attach_function :uenum_next, "uenum_next#{suffix}", [:pointer, :pointer, :pointer], :string
|
||||
|
||||
attach_function :u_charsToUChars, "u_charsToUChars#{suffix}", [:string, :pointer, :int32], :void
|
||||
attach_function :u_UCharsToChars, "u_UCharsToChars#{suffix}", [:pointer, :string, :int32], :void
|
||||
|
||||
# CharDet
|
||||
#
|
||||
|
@ -153,9 +154,9 @@ module ICU
|
|||
#
|
||||
|
||||
enum :normalization_mode, [ :none, 1,
|
||||
:ndf, 2,
|
||||
:nfd, 2,
|
||||
:nfkd, 3,
|
||||
:nfd, 4,
|
||||
:nfc, 4,
|
||||
:default, 4,
|
||||
:nfkc, 5,
|
||||
:fcd, 6
|
||||
|
|
|
@ -1,25 +1,28 @@
|
|||
module ICU
|
||||
module Normalization
|
||||
|
||||
def self.normalize(str, mode = :default)
|
||||
needed = 0
|
||||
options = 0
|
||||
def self.normalize(input, mode = :default)
|
||||
input_length = ICU.ruby19? ? input.length : input.jlength
|
||||
needed_length = 0
|
||||
result_length = 0
|
||||
|
||||
retried = false
|
||||
ptr = nil
|
||||
|
||||
begin
|
||||
Lib.check_error do |error|
|
||||
needed = Lib.unorm_normalize(UCharPointer.from_string(str), str.length, mode, options, ptr, needed, error)
|
||||
needed_length = Lib.unorm_normalize(UCharPointer.from_string(input), input_length, mode, 0, ptr, result_length, error)
|
||||
end
|
||||
rescue BufferOverflowError
|
||||
raise if retried
|
||||
ptr = UCharPointer.from_string("\0" * needed_length)
|
||||
result_length = needed_length + 1
|
||||
|
||||
ptr = UCharPointer.from_string(' '*needed)
|
||||
retried = true
|
||||
retry
|
||||
end
|
||||
|
||||
ptr.string
|
||||
ptr.string if ptr
|
||||
end
|
||||
|
||||
end # Normalization
|
||||
|
|
|
@ -1,15 +1,22 @@
|
|||
module ICU
|
||||
class UCharPointer < FFI::MemoryPointer
|
||||
|
||||
UCHAR_TYPE = :uint16 # not sure how platform-dependent this is..
|
||||
|
||||
def self.from_string(str)
|
||||
# not sure how this will work with other encodings
|
||||
str = str.encode("UTF-8") if str.respond_to? :encode
|
||||
super str.unpack("U*").pack("L*")
|
||||
bytes = str.unpack("U*")
|
||||
|
||||
ptr = new UCHAR_TYPE, bytes.size
|
||||
ptr.put_array_of_uint16 0, bytes
|
||||
|
||||
ptr
|
||||
end
|
||||
|
||||
def string
|
||||
wstring = read_string(size)
|
||||
wstring.unpack("L*").pack("U*")
|
||||
wstring = get_array_of_uint16(0, size / FFI.type_size(UCHAR_TYPE))
|
||||
wstring.pack("U*")
|
||||
end
|
||||
|
||||
end # UCharPointer
|
||||
end # ICU
|
||||
|
|
|
@ -4,11 +4,16 @@ require 'spec_helper'
|
|||
|
||||
module ICU
|
||||
module Normalization
|
||||
# http://bugs.icu-project.org/trac/browser/icu/trunk/source/test/cintltst/cnormtst.c
|
||||
|
||||
describe "Normalization" do
|
||||
|
||||
it "should normalize a string" do
|
||||
# not sure if this expectation is correct
|
||||
ICU::Normalization.normalize("æåø").should == "aeao"
|
||||
it "should normalize a string - decomposed" do
|
||||
ICU::Normalization.normalize("Å", :nfd).unpack("U*").should == [65, 778]
|
||||
end
|
||||
|
||||
it "should normalize a string - composed" do
|
||||
ICU::Normalization.normalize("Å", :nfc).unpack("U*").should == [197]
|
||||
end
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue