Normalization fixes

This commit is contained in:
Jari Bakken 2010-05-14 02:19:50 +02:00
parent 2c3ec3d6e9
commit 2924ebef0e
5 changed files with 41 additions and 17 deletions

View File

@ -14,6 +14,10 @@ module ICU
os
end
end
# True when the interpreter's RUBY_VERSION string sorts at or after '1.9'.
# This is a plain String comparison, not a parsed version comparison.
def self.ruby19?
  (RUBY_VERSION <=> '1.9') >= 0
end
end
require "ffi-icu/lib"
@ -23,3 +27,7 @@ require "ffi-icu/collation"
require "ffi-icu/transliteration"
require "ffi-icu/normalization"
# Pre-1.9 interpreters only: load jcode and set $KCODE to 'u' (UTF-8).
# Normalization.normalize calls String#jlength on this code path —
# presumably that method is supplied by jcode; verify against the 1.8 stdlib.
unless ICU.ruby19?
require 'jcode'
$KCODE = 'u'
end

View File

@ -81,7 +81,8 @@ module ICU
# Bindings for the uenum_* enumeration functions (count, close, next); each C
# symbol name is built by appending the library-specific suffix.
attach_function :uenum_count, "uenum_count#{suffix}", [:pointer, :pointer], :int
attach_function :uenum_close, "uenum_close#{suffix}", [:pointer], :void
attach_function :uenum_next, "uenum_next#{suffix}", [:pointer, :pointer, :pointer], :string
# Conversions between char buffers and UChar buffers. The trailing :int32 is
# presumably the number of characters to convert — confirm against ICU's ustring.h.
attach_function :u_charsToUChars, "u_charsToUChars#{suffix}", [:string, :pointer, :int32], :void
# NOTE(review): the second argument looks like the destination char buffer that
# the callee writes into; confirm that FFI's :string parameter type is suitable
# for an output buffer (a :pointer may be required).
attach_function :u_UCharsToChars, "u_UCharsToChars#{suffix}", [:pointer, :string, :int32], :void
# CharDet
#
@ -153,9 +154,9 @@ module ICU
#
enum :normalization_mode, [ :none, 1,
:ndf, 2,
:nfd, 2,
:nfkd, 3,
:nfd, 4,
:nfc, 4,
:default, 4,
:nfkc, 5,
:fcd, 6

View File

@ -1,25 +1,28 @@
module ICU
# Unicode normalization built on Lib.unorm_normalize.
module Normalization
def self.normalize(str, mode = :default)
needed = 0
options = 0
# Normalizes +input+ using the given normalization +mode+ (defaults to
# :default) and returns the normalized text as a Ruby string.
#
# The native call is attempted at most twice. The first attempt passes no
# output buffer (ptr is nil, result_length is 0). If that raises
# BufferOverflowError, a buffer is allocated from needed_length and the call
# is retried once; a second overflow is re-raised to the caller.
#
# Returns nil when no buffer was ever allocated, i.e. when the first attempt
# completed without raising BufferOverflowError.
def self.normalize(input, mode = :default)
# Character count: String#length on 1.9+, String#jlength on older Rubies.
input_length = ICU.ruby19? ? input.length : input.jlength
needed_length = 0
result_length = 0
# Guards the retry below so the rescue can run only once.
retried = false
ptr = nil
begin
Lib.check_error do |error|
needed = Lib.unorm_normalize(UCharPointer.from_string(str), str.length, mode, options, ptr, needed, error)
needed_length = Lib.unorm_normalize(UCharPointer.from_string(input), input_length, mode, 0, ptr, result_length, error)
end
rescue BufferOverflowError
raise if retried
# NOTE(review): the buffer is built from needed_length NUL characters, yet the
# capacity passed on the retry is needed_length + 1 — confirm that
# unorm_normalize cannot write a terminator past the end of this allocation.
ptr = UCharPointer.from_string("\0" * needed_length)
result_length = needed_length + 1
ptr = UCharPointer.from_string(' '*needed)
retried = true
retry
end
ptr.string
ptr.string if ptr
end
end # Normalization

View File

@ -1,15 +1,22 @@
module ICU
# FFI::MemoryPointer subclass holding a sequence of UChar values, with helpers
# for converting from and to Ruby strings.
class UCharPointer < FFI::MemoryPointer
# Element type used for the buffer: one unsigned 16-bit slot per UChar.
UCHAR_TYPE = :uint16 # not sure how platform-dependent this is..
# Builds a UCharPointer from +str+. Strings that respond to #encode are
# transcoded to UTF-8 first so that unpack("U*") can decode them.
def self.from_string(str)
# not sure how this will work with other encodings
str = str.encode("UTF-8") if str.respond_to? :encode
super str.unpack("U*").pack("L*")
# Despite the name, +bytes+ holds the code points produced by unpack("U*"),
# one Integer per character.
bytes = str.unpack("U*")
ptr = new UCHAR_TYPE, bytes.size
# NOTE(review): each code point is written into a single uint16 slot; values
# above 0xFFFF do not fit and presumably need surrogate pairs — confirm how
# supplementary characters should be handled.
ptr.put_array_of_uint16 0, bytes
ptr
end
# Returns the buffer contents as a Ruby string by packing every uint16 slot
# with pack("U*"). The whole allocation is read (size divided by the element
# size), so unused zero slots would be included in the result —
# NOTE(review): confirm callers always size the buffer exactly.
def string
wstring = read_string(size)
wstring.unpack("L*").pack("U*")
wstring = get_array_of_uint16(0, size / FFI.type_size(UCHAR_TYPE))
wstring.pack("U*")
end
end # UCharPointer
end # ICU

View File

@ -4,11 +4,16 @@ require 'spec_helper'
module ICU
module Normalization
# http://bugs.icu-project.org/trac/browser/icu/trunk/source/test/cintltst/cnormtst.c
describe "Normalization" do
it "should normalize a string" do
# not sure if this expectation is correct
ICU::Normalization.normalize("æåø").should == "aeao"
it "should normalize a string - decomposed" do
ICU::Normalization.normalize("Å", :nfd).unpack("U*").should == [65, 778]
end
it "should normalize a string - composed" do
ICU::Normalization.normalize("Å", :nfc).unpack("U*").should == [197]
end