Implement more of the API

This commit is contained in:
Jari Bakken 2010-05-09 22:29:55 +02:00
parent 3b96bf3c73
commit 47b366a11d
5 changed files with 108 additions and 25 deletions

View File

@ -32,6 +32,11 @@ I've tested this on these platforms:
- Debian Linux
- Arch Linux
and these rubies:
- MRI 1.9.1
- MRI 1.8.7
YMMV.
== Note on Patches/Pull Requests

View File

@ -4,14 +4,14 @@ require 'rake'
begin
require 'jeweler'
Jeweler::Tasks.new do |gem|
gem.name = "icu-chardet-ffi"
gem.summary = %Q{Tiny FFI wrapper for ICU's UCharsetDetector.}
gem.name = "icu-chardet-ffi"
gem.summary = %Q{Tiny FFI wrapper for ICU's UCharsetDetector.}
gem.description = %Q{Tiny FFI wrapper for ICU's UCharsetDetector.}
gem.email = "jari.bakken@gmail.com"
gem.homepage = "http://github.com/jarib/icu-chardet-ffi"
gem.authors = ["Jari Bakken"]
gem.email = "jari.bakken@gmail.com"
gem.homepage = "http://github.com/jarib/icu-chardet-ffi"
gem.authors = ["Jari Bakken"]
gem.add_dependency "ffi", "0.6.3"
gem.add_dependency "ffi", ">= 0.6.3"
gem.add_development_dependency "rspec", ">= 1.3.0"
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
end

View File

@ -17,13 +17,22 @@ module ICU
suffix = '_44'
end
#
# http://icu-project.org/apiref/icu4c/ucsdet_8h.html
#
# Each function is attached under its C symbol name with the version
# `suffix` appended (e.g. '_44'); plain-name aliases are set up separately
# with alias_method. Argument and return types follow the FFI type symbols
# given here.
# NOTE(review): the trailing :pointer argument on most ucsdet_* calls is
# presumably the UErrorCode* out-parameter -- confirm against the ICU docs.
#
attach_function "ucsdet_open#{suffix}", [:pointer], :pointer
attach_function "ucsdet_close#{suffix}", [:pointer], :void
attach_function "ucsdet_setText#{suffix}", [:pointer, :string, :int, :pointer], :void
attach_function "ucsdet_setDeclaredEncoding#{suffix}", [:pointer, :string, :int, :pointer], :void
attach_function "ucsdet_detect#{suffix}", [:pointer, :pointer], :pointer
attach_function "ucsdet_detectAll#{suffix}", [:pointer, :pointer, :pointer], :pointer
attach_function "ucsdet_getName#{suffix}", [:pointer, :pointer], :string
attach_function "ucsdet_getConfidence#{suffix}", [:pointer, :pointer], :int
attach_function "ucsdet_getLanguage#{suffix}", [:pointer, :pointer], :string
attach_function "ucsdet_getAllDetectableCharsets#{suffix}", [:pointer, :pointer], :pointer
attach_function "ucsdet_isInputFilterEnabled#{suffix}", [:pointer], :bool
attach_function "ucsdet_enableInputFilter#{suffix}", [:pointer, :bool], :bool
# Helpers outside the ucsdet_* family: error-code naming and enumeration access.
attach_function "u_errorName#{suffix}", [:int], :string
attach_function "uenum_count#{suffix}", [:pointer, :pointer], :int
attach_function "uenum_close#{suffix}", [:pointer], :void
@ -38,7 +47,14 @@ module ICU
# Plain-name aliases: callers use the unsuffixed ICU names no matter which
# version `suffix` the bindings were attached under.
alias_method :ucsdet_detect, "ucsdet_detect#{suffix}"
alias_method :ucsdet_getName, "ucsdet_getName#{suffix}"
alias_method :ucsdet_getConfidence, "ucsdet_getConfidence#{suffix}"
alias_method :ucsdet_getLanguage, "ucsdet_getLanguage#{suffix}"
alias_method :ucsdet_isInputFilterEnabled, "ucsdet_isInputFilterEnabled#{suffix}"
alias_method :ucsdet_enableInputFilter, "ucsdet_enableInputFilter#{suffix}"
alias_method :ucsdet_setDeclaredEncoding, "ucsdet_setDeclaredEncoding#{suffix}"
alias_method :u_errorName, "u_errorName#{suffix}"
# Every uenum_* binding gets its own alias name. Re-aliasing :uenum_count
# would leave it bound to the last target and leave uenum_close/uenum_next
# without a plain name whenever the suffix is non-empty.
alias_method :uenum_count, "uenum_count#{suffix}"
alias_method :uenum_close, "uenum_close#{suffix}"
alias_method :uenum_next, "uenum_next#{suffix}"
end
end
@ -51,7 +67,7 @@ module ICU
end
class Detector
Match = Struct.new(:name, :confidence)
Match = Struct.new(:name, :confidence, :language)
def initialize
check_status do |ptr|
@ -59,31 +75,50 @@ module ICU
end
end
# Asks the native detector whether its input filter is switched on and
# returns whatever ucsdet_isInputFilterEnabled reports for @detector.
def input_filter_enabled?
  CharDet.ucsdet_isInputFilterEnabled(@detector)
end
# Turns the detector's input filter on or off.
#
# bool - any object; its truthiness is collapsed to a strict true/false
#        before it is handed to ucsdet_enableInputFilter.
def input_filter_enabled=(bool)
  flag = bool ? true : false
  CharDet.ucsdet_enableInputFilter(@detector, flag)
end
# Passes a declared encoding name to the detector.
#
# str - encoding name; it is sent together with its byte length.
#
# The native call runs inside check_status, which supplies the status
# argument for ucsdet_setDeclaredEncoding.
def declared_encoding=(str)
  check_status do |status|
    byte_length = str.bytesize
    CharDet.ucsdet_setDeclaredEncoding(@detector, str, byte_length, status)
  end
end
# Releases the native detector handle via ucsdet_close.
#
# Safe to call more than once: the handle is cleared after the first call,
# so a repeated close never hands an already-released pointer back to ICU.
# Always returns nil.
def close
  return if @detector.nil?

  CharDet.ucsdet_close(@detector)
  @detector = nil
end
def detect(str)
check_status do |ptr|
CharDet.ucsdet_setText(@detector, str, str.bytesize, ptr)
end
set_text(str)
match_ptr = nil
check_status do |ptr|
match_ptr = CharDet.ucsdet_detect(@detector, ptr)
end
result = Match.new
check_status do |ptr|
result.name = CharDet.ucsdet_getName(match_ptr, ptr)
match_ptr_to_ruby(match_ptr) unless match_ptr.null?
end
def detect_all(str)
set_text(str)
matches_found_ptr = FFI::MemoryPointer.new :int
array_ptr = nil
check_status do |status|
array_ptr = CharDet.ucsdet_detectAll(@detector, matches_found_ptr, status)
end
check_status do |ptr|
result.confidence = CharDet.ucsdet_getConfidence(match_ptr, ptr)
end
length = matches_found_ptr.read_int
result
array_ptr.read_array_of_pointer(length).map do |match|
match_ptr_to_ruby(match)
end
end
def detectable_charsets
@ -93,7 +128,7 @@ module ICU
enum_ptr = CharDet.ucsdet_getAllDetectableCharsets(@detector, ptr)
end
result = enumeration_to_array(enum_ptr)
result = enum_ptr_to_array(enum_ptr)
CharDet.uenum_close(enum_ptr)
result
@ -112,20 +147,44 @@ module ICU
end
end
def enumeration_to_array(ptr)
def enum_ptr_to_array(enum_ptr)
length = 0
check_status do |status|
length = CharDet.uenum_count(ptr, status)
length = CharDet.uenum_count(enum_ptr, status)
end
result = []
0.upto(length - 1) do |idx|
check_status { |st| result << CharDet.uenum_next(ptr, nil, st) }
check_status { |st| result << CharDet.uenum_next(enum_ptr, nil, st) }
end
result
end
# Converts a native match pointer into a Match struct.
#
# match_ptr - pointer handed to the ucsdet_get* accessors.
#
# Name, confidence and language are each read in their own check_status
# call, in that order, and then packed into a Match.
def match_ptr_to_ruby(match_ptr)
  name = confidence = language = nil

  check_status { |status| name = CharDet.ucsdet_getName(match_ptr, status) }
  check_status { |status| confidence = CharDet.ucsdet_getConfidence(match_ptr, status) }
  check_status { |status| language = CharDet.ucsdet_getLanguage(match_ptr, status) }

  Match.new(name, confidence, language)
end
# Gives the detector the text to analyse.
#
# text - the string to inspect; it is passed along with its byte length.
#
# NOTE(review): ICU's ucsdet_setText documentation says the input buffer is
# not copied -- confirm callers keep `text` alive while the detector uses it.
def set_text(text)
  check_status { |err| CharDet.ucsdet_setText(@detector, text, text.bytesize, err) }
end
end # Detector
end # CharDet
end # ICU

View File

@ -3,20 +3,37 @@
require 'spec_helper'
describe ICU::CharDet::Detector do
before { @d = ICU::CharDet::Detector.new }
after { @d.close }
it "should recognize UTF-8" do
@d.detect("æåø").name.should == "UTF-8"
m = @d.detect("æåø")
m.name.should == "UTF-8"
m.language.should be_kind_of(String)
end
it "has a list of detectable charsets" do
cs = @d.detectable_charsets
cs.should be_kind_of(Array)
cs.should_not be_empty
cs.first.should be_kind_of(String)
end
it "should disable / enable the input filter" do
@d.input_filter_enabled?.should be_false
@d.input_filter_enabled = true
@d.input_filter_enabled?.should be_true
end
# Smoke test: assigning a declared encoding must complete without raising.
it "should set declared encoding" do
  @d.declared_encoding = "UTF-8"
end
it "should detect several matching encodings" do
r = @d.detect_all("foo bar")
r.should be_instance_of(Array)
end
end

View File

@ -1,5 +1,7 @@
$LOAD_PATH.unshift(File.dirname(__FILE__))
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
require "rubygems"
require 'icu-chardet-ffi'
require 'spec'
require 'spec/autorun'