class ICU::CharsetDetector

Overview

Charset detection

This class provides a facility for detecting the charset or encoding of character data in an unknown text format.

Usage

csdet = ICU::CharsetDetector.new
csm = csdet.detect("Sôme text")
csm.name       # => "UTF-8"
csm.confidence # => 80

See also

Defined in:

icu/charset_detector.cr

Constructors

Class Method Summary

Instance Method Summary

Constructor Detail

def self.new #

[View source]

Class Method Detail

def self.detectable_charsets : Array(String) #

Returns the list of detectable charsets

(see ICU::CharsetDetector#detectable_charsets)


[View source]

Instance Method Detail

def detect(text : String) : CharsetMatch #

Returns the charset that best matches the supplied input data.

csdet = ICU::CharsetDetector.new

csm = csdet.detect("Some text")
csm.name       # => "ISO-8859-1"
csm.confidence # => 30

csm = csdet.detect("Sôme other text")
csm.name       # => "UTF-8"
csm.confidence # => 80

FIXME: this method is not thread-safe.


[View source]
def detect_all(text : String) : Array(CharsetMatch) #

Find all charset matches that appear to be consistent with the input. The results are ordered with the best quality match first.

csdet = ICU::CharsetDetector.new

csms = csdet.detect_all("Some text")
csms.map { |csm| {name: csm.name, confidence: csm.confidence} }
# => [{name: "ISO-8859-1", confidence: 30},
#     {name: "ISO-8859-2", confidence: 30},
#     {name: "UTF-8", confidence: 15},
#     {name: "UTF-16BE", confidence: 10},
#     {name: "UTF-16LE", confidence: 10}]

FIXME: this method is not thread-safe.


[View source]
def detectable_charsets : Array(String) #

Returns the list of detectable charsets

ICU::CharsetDetector.new.detectable_charsets
# => ["UTF-8",
#     "UTF-16BE",
#     "UTF-16LE",
#     ...]

[View source]
def finalize #

[View source]
def to_unsafe : LibICU::UCharsetDetector #

[View source]