mirror of https://github.com/erickguan/ffi-icu.git
Implement more of the API
This commit is contained in:
parent
3b96bf3c73
commit
47b366a11d
|
@ -32,6 +32,11 @@ I've tested this on these platforms:
|
|||
- Debian Linux
|
||||
- Arch Linux
|
||||
|
||||
and these rubies:
|
||||
|
||||
- MRI 1.9.1
|
||||
- MRI 1.8.7
|
||||
|
||||
YMMV.
|
||||
|
||||
== Note on Patches/Pull Requests
|
||||
|
|
12
Rakefile
12
Rakefile
|
@ -4,14 +4,14 @@ require 'rake'
|
|||
begin
|
||||
require 'jeweler'
|
||||
Jeweler::Tasks.new do |gem|
|
||||
gem.name = "icu-chardet-ffi"
|
||||
gem.summary = %Q{Tiny FFI wrapper for ICU's UCharsetDetector.}
|
||||
gem.name = "icu-chardet-ffi"
|
||||
gem.summary = %Q{Tiny FFI wrapper for ICU's UCharsetDetector.}
|
||||
gem.description = %Q{Tiny FFI wrapper for ICU's UCharsetDetector.}
|
||||
gem.email = "jari.bakken@gmail.com"
|
||||
gem.homepage = "http://github.com/jarib/icu-chardet-ffi"
|
||||
gem.authors = ["Jari Bakken"]
|
||||
gem.email = "jari.bakken@gmail.com"
|
||||
gem.homepage = "http://github.com/jarib/icu-chardet-ffi"
|
||||
gem.authors = ["Jari Bakken"]
|
||||
|
||||
gem.add_dependency "ffi", "0.6.3"
|
||||
gem.add_dependency "ffi", ">= 0.6.3"
|
||||
gem.add_development_dependency "rspec", ">= 1.3.0"
|
||||
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
||||
end
|
||||
|
|
|
@ -17,13 +17,22 @@ module ICU
|
|||
suffix = '_44'
|
||||
end
|
||||
|
||||
#
|
||||
# http://icu-project.org/apiref/icu4c/ucsdet_8h.html
|
||||
#
|
||||
|
||||
attach_function "ucsdet_open#{suffix}", [:pointer], :pointer
|
||||
attach_function "ucsdet_close#{suffix}", [:pointer], :void
|
||||
attach_function "ucsdet_setText#{suffix}", [:pointer, :string, :int, :pointer], :void
|
||||
attach_function "ucsdet_setDeclaredEncoding#{suffix}", [:pointer, :string, :int, :pointer], :void
|
||||
attach_function "ucsdet_detect#{suffix}", [:pointer, :pointer], :pointer
|
||||
attach_function "ucsdet_detectAll#{suffix}", [:pointer, :pointer, :pointer], :pointer
|
||||
attach_function "ucsdet_getName#{suffix}", [:pointer, :pointer], :string
|
||||
attach_function "ucsdet_getConfidence#{suffix}", [:pointer, :pointer], :int
|
||||
attach_function "ucsdet_getLanguage#{suffix}", [:pointer, :pointer], :string
|
||||
attach_function "ucsdet_getAllDetectableCharsets#{suffix}", [:pointer, :pointer], :pointer
|
||||
attach_function "ucsdet_isInputFilterEnabled#{suffix}", [:pointer], :bool
|
||||
attach_function "ucsdet_enableInputFilter#{suffix}", [:pointer, :bool], :bool
|
||||
attach_function "u_errorName#{suffix}", [:int], :string
|
||||
attach_function "uenum_count#{suffix}", [:pointer, :pointer], :int
|
||||
attach_function "uenum_close#{suffix}", [:pointer], :void
|
||||
|
@ -38,7 +47,14 @@ module ICU
|
|||
alias_method :ucsdet_detect, "ucsdet_detect#{suffix}"
|
||||
alias_method :ucsdet_getName, "ucsdet_getName#{suffix}"
|
||||
alias_method :ucsdet_getConfidence, "ucsdet_getConfidence#{suffix}"
|
||||
alias_method :ucsdet_getLanguage, "ucsdet_getLanguage#{suffix}"
|
||||
alias_method :ucsdet_isInputFilterEnabled, "ucsdet_isInputFilterEnabled#{suffix}"
|
||||
alias_method :ucsdet_enableInputFilter, "ucsdet_enableInputFilter#{suffix}"
|
||||
alias_method :ucsdet_setDeclaredEncoding, "ucsdet_setDeclaredEncoding#{suffix}"
|
||||
alias_method :u_errorName, "u_errorName#{suffix}"
|
||||
# Expose the version-suffixed uenum_* functions under their plain names.
# Each function needs its own alias name: reusing :uenum_count for all three
# would leave uenum_count pointing at uenum_next and leave uenum_close /
# uenum_next without an unsuffixed name.
alias_method :uenum_count, "uenum_count#{suffix}"
alias_method :uenum_close, "uenum_close#{suffix}"
alias_method :uenum_next, "uenum_next#{suffix}"
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -51,7 +67,7 @@ module ICU
|
|||
end
|
||||
|
||||
class Detector
|
||||
Match = Struct.new(:name, :confidence)
|
||||
Match = Struct.new(:name, :confidence, :language)
|
||||
|
||||
def initialize
|
||||
check_status do |ptr|
|
||||
|
@ -59,31 +75,50 @@ module ICU
|
|||
end
|
||||
end
|
||||
|
||||
# Asks ICU (ucsdet_isInputFilterEnabled) whether the input filter is
# currently enabled for this detector and returns that boolean.
def input_filter_enabled?
  CharDet.ucsdet_isInputFilterEnabled(@detector)
end
|
||||
|
||||
# Enables or disables the detector's input filter.
# The argument is coerced to a strict true/false before it is passed to
# ucsdet_enableInputFilter, so any truthy/falsy value is accepted.
def input_filter_enabled=(bool)
  flag = bool ? true : false
  CharDet.ucsdet_enableInputFilter(@detector, flag)
end
|
||||
|
||||
# Passes a declared encoding name to the detector via
# ucsdet_setDeclaredEncoding. The name and its byte length are sent along with
# the status pointer yielded by check_status.
def declared_encoding=(str)
  check_status do |status|
    CharDet.ucsdet_setDeclaredEncoding(@detector, str, str.bytesize, status)
  end
end
|
||||
|
||||
# Hands the underlying detector handle to ucsdet_close.
# NOTE(review): @detector is not cleared afterwards, so calling any method
# (including close) again would reuse the closed handle — confirm callers
# never do that.
def close
  CharDet.ucsdet_close(@detector)
end
|
||||
|
||||
def detect(str)
|
||||
check_status do |ptr|
|
||||
CharDet.ucsdet_setText(@detector, str, str.bytesize, ptr)
|
||||
end
|
||||
set_text(str)
|
||||
|
||||
match_ptr = nil
|
||||
|
||||
check_status do |ptr|
|
||||
match_ptr = CharDet.ucsdet_detect(@detector, ptr)
|
||||
end
|
||||
|
||||
result = Match.new
|
||||
check_status do |ptr|
|
||||
result.name = CharDet.ucsdet_getName(match_ptr, ptr)
|
||||
match_ptr_to_ruby(match_ptr) unless match_ptr.null?
|
||||
end
|
||||
|
||||
def detect_all(str)
|
||||
set_text(str)
|
||||
|
||||
matches_found_ptr = FFI::MemoryPointer.new :int
|
||||
array_ptr = nil
|
||||
|
||||
check_status do |status|
|
||||
array_ptr = CharDet.ucsdet_detectAll(@detector, matches_found_ptr, status)
|
||||
end
|
||||
|
||||
check_status do |ptr|
|
||||
result.confidence = CharDet.ucsdet_getConfidence(match_ptr, ptr)
|
||||
end
|
||||
length = matches_found_ptr.read_int
|
||||
|
||||
result
|
||||
array_ptr.read_array_of_pointer(length).map do |match|
|
||||
match_ptr_to_ruby(match)
|
||||
end
|
||||
end
|
||||
|
||||
def detectable_charsets
|
||||
|
@ -93,7 +128,7 @@ module ICU
|
|||
enum_ptr = CharDet.ucsdet_getAllDetectableCharsets(@detector, ptr)
|
||||
end
|
||||
|
||||
result = enumeration_to_array(enum_ptr)
|
||||
result = enum_ptr_to_array(enum_ptr)
|
||||
CharDet.uenum_close(enum_ptr)
|
||||
|
||||
result
|
||||
|
@ -112,20 +147,44 @@ module ICU
|
|||
end
|
||||
end
|
||||
|
||||
def enumeration_to_array(ptr)
|
||||
def enum_ptr_to_array(enum_ptr)
|
||||
length = 0
|
||||
check_status do |status|
|
||||
length = CharDet.uenum_count(ptr, status)
|
||||
length = CharDet.uenum_count(enum_ptr, status)
|
||||
end
|
||||
|
||||
result = []
|
||||
0.upto(length - 1) do |idx|
|
||||
check_status { |st| result << CharDet.uenum_next(ptr, nil, st) }
|
||||
check_status { |st| result << CharDet.uenum_next(enum_ptr, nil, st) }
|
||||
end
|
||||
|
||||
result
|
||||
end
|
||||
|
||||
# Builds a Match struct from an ICU match handle.
#
# match_ptr - pointer to a match as returned by the detect functions.
#
# Reads name, confidence and language in that order, each through its own
# check_status call so every ICU call gets its own status pointer.
# Returns the populated Match.
def match_ptr_to_ruby(match_ptr)
  match = Match.new

  check_status { |status| match.name = CharDet.ucsdet_getName(match_ptr, status) }
  check_status { |status| match.confidence = CharDet.ucsdet_getConfidence(match_ptr, status) }
  check_status { |status| match.language = CharDet.ucsdet_getLanguage(match_ptr, status) }

  match
end
|
||||
|
||||
# Gives the detector the text to analyse via ucsdet_setText.
#
# text - the String to inspect; its byte length (not character count) is
#        passed as the length argument.
def set_text(text)
  check_status { |err| CharDet.ucsdet_setText(@detector, text, text.bytesize, err) }
end
|
||||
|
||||
end # Detector
|
||||
end # CharDet
|
||||
end # ICU
|
||||
|
|
|
@ -3,20 +3,37 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe ICU::CharDet::Detector do
|
||||
|
||||
before { @d = ICU::CharDet::Detector.new }
|
||||
after { @d.close }
|
||||
|
||||
|
||||
it "should recognize UTF-8" do
|
||||
@d.detect("æåø").name.should == "UTF-8"
|
||||
m = @d.detect("æåø")
|
||||
m.name.should == "UTF-8"
|
||||
m.language.should be_kind_of(String)
|
||||
end
|
||||
|
||||
it "has a list of detectable charsets" do
|
||||
cs = @d.detectable_charsets
|
||||
cs.should be_kind_of(Array)
|
||||
cs.should_not be_empty
|
||||
|
||||
|
||||
cs.first.should be_kind_of(String)
|
||||
end
|
||||
|
||||
it "should disable / enable the input filter" do
|
||||
@d.input_filter_enabled?.should be_false
|
||||
@d.input_filter_enabled = true
|
||||
@d.input_filter_enabled?.should be_true
|
||||
end
|
||||
|
||||
# Smoke test: contains no expectation, so it passes as long as the setter
# does not raise.
it "should set declared encoding" do
  @d.declared_encoding = "UTF-8"
end
|
||||
|
||||
it "should detect several matching encodings" do
|
||||
r = @d.detect_all("foo bar")
|
||||
r.should be_instance_of(Array)
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
||||
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
||||
|
||||
require "rubygems"
|
||||
require 'icu-chardet-ffi'
|
||||
require 'spec'
|
||||
require 'spec/autorun'
|
||||
|
|
Loading…
Reference in New Issue