Commit 597e31cd authored by Ondrej Sury's avatar Ondrej Sury

Imported Upstream version 0.6.9.4

parents
.bundle/
tmp/
vendor/
*.bundle
ext/charlock_holmes/dst
*.a
ext/charlock_holmes/src/file-*
ext/charlock_holmes/src/mkmf.log
\ No newline at end of file
# Resolve gems from the official RubyGems index over HTTPS. The `:rubygems`
# shorthand is deprecated by Bundler because it resolves to plain HTTP.
source 'https://rubygems.org'

# Take the gem's dependencies from the gemspec.
gemspec
\ No newline at end of file
PATH
remote: .
specs:
charlock_holmes (0.6.9.4)
GEM
remote: http://rubygems.org/
specs:
chardet (0.9.0)
minitest (4.6.2)
rake (0.9.2)
rake-compiler (0.7.9)
rake
PLATFORMS
ruby
DEPENDENCIES
chardet
charlock_holmes!
minitest
rake-compiler (>= 0.7.5)
Copyright (c) 2011 Brian Lopez - http://github.com/brianmario
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
# CharlockHolmes
Character encoding detecting library for Ruby using [ICU](http://site.icu-project.org/)
## Usage
First you'll need to require it
``` ruby
require 'charlock_holmes'
```
## Encoding detection
``` ruby
contents = File.read('test.xml')
detection = CharlockHolmes::EncodingDetector.detect(contents)
# => {:encoding => 'UTF-8', :confidence => 100, :type => :text}
# optionally there will be a :language key as well, but
# that's mostly only returned for legacy encodings like ISO-8859-1
```
NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
For binary content, `:type` will be set to `:binary`
Though it's more efficient to reuse one detector instance:
``` ruby
detector = CharlockHolmes::EncodingDetector.new
detection1 = detector.detect(File.read('test.xml'))
detection2 = detector.detect(File.read('test2.json'))
# and so on...
```
### String monkey patch
Alternatively, you can just use the `detect_encoding` method on the `String` class
``` ruby
require 'charlock_holmes/string'
contents = File.read('test.xml')
detection = contents.detect_encoding
```
### Ruby 1.9 specific
NOTE: This method only exists on Ruby 1.9+
If you want to use this library to detect and set the encoding flag on strings, you can use the `detect_encoding!` method on the `String` class
``` ruby
require 'charlock_holmes/string'
contents = File.read('test.xml')
# this will detect and set the encoding of `contents`, then return self
contents.detect_encoding!
```
## Transcoding
Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.
``` ruby
content = File.read('test2.txt')
detection = CharlockHolmes::EncodingDetector.detect(content)
utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
```
The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.
## Installing
If the traditional `gem install charlock_holmes` doesn't work, you may need to specify the path to
your installation of ICU using the `--with-icu-dir` option during the gem install or by configuring Bundler to
pass those arguments to Gem:
Configure Bundler to always use the correct arguments when installing:
bundle config build.charlock_holmes --with-icu-dir=/path/to/installed/icu4c
Using Gem to install directly without Bundler:
gem install charlock_holmes -- --with-icu-dir=/path/to/installed/icu4c
### Homebrew
If you're installing on Mac OS X then using [Homebrew](http://mxcl.github.com/homebrew/) is
the easiest way to install ICU.
However, be warned: it is a Keg-Only install (see [Homebrew issue #167](https://github.com/mxcl/homebrew/issues/167)
for more info), meaning RubyGems won't find it when installing without specifying `--with-icu-dir`.
To install ICU with Homebrew:
brew install icu4c
Configure Bundler to always use the correct arguments when installing:
bundle config build.charlock_holmes --with-icu-dir=/usr/local/opt/icu4c
Using Gem to install directly without Bundler:
gem install charlock_holmes -- --with-icu-dir=/usr/local/opt/icu4c
require 'rake/testtask'

# `rake test` runs every *_test.rb file found below test/.
Rake::TestTask.new { |t| t.pattern = "test/**/*_test.rb" }

# A bare `rake` runs the test task.
task :default => :test

# The native extension task comes from rake-compiler (0.7.5 or newer).
gem 'rake-compiler', '>= 0.7.5'
require "rake/extensiontask"

# Place the compiled extension under lib/charlock_holmes.
Rake::ExtensionTask.new('charlock_holmes') do |ext|
  ext.lib_dir = File.join('lib', 'charlock_holmes')
end

# Compile the extension before the tests run.
Rake::Task[:test].prerequisites << :compile
\ No newline at end of file
# Benchmark script: times encoding detection with a new detector per call,
# with one shared detector, and (on Rubies older than 1.9) with chardet.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

# True on Ruby 1.9 and on every later release. The version is compared
# numerically, component by component; the previous `=~ /1.9/` check was false
# on Ruby 2.0+, which made the script load chardet on exactly the Rubies it
# was meant to skip.
RUBY_19 = (RUBY_VERSION.split('.').map { |part| part.to_i } <=> [1, 9]) >= 0

require 'charlock_holmes'

# the chardet gem isn't compatible with 1.9
require 'UniversalDetector' unless RUBY_19

require 'benchmark'

CONTENT = File.read(File.expand_path('../test.txt', __FILE__))

TIMES = 100
DETECTOR = CharlockHolmes::EncodingDetector.new

Benchmark.bmbm do |x|
  # new detector every iteration
  x.report 'singleton call' do
    TIMES.times do
      CharlockHolmes::EncodingDetector.detect CONTENT
    end
  end

  # shared detector for all iterations
  x.report 'reusing a single detector' do
    TIMES.times do
      DETECTOR.detect CONTENT
    end
  end

  # chardet is only loaded (and therefore only measured) before Ruby 1.9
  unless RUBY_19
    x.report 'chardet' do
      TIMES.times do
        UniversalDetector.chardet CONTENT
      end
    end
  end
end
This diff is collapsed.
# encoding: utf-8
# Gem specification for charlock_holmes.
require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION

Gem::Specification.new do |s|
  s.name = %q{charlock_holmes}
  s.version = CharlockHolmes::VERSION
  s.authors = ["Brian Lopez", "Vicent Martí"]
  s.date = Time.now.utc.strftime("%Y-%m-%d")
  s.email = %q{seniorlopez@gmail.com}
  # The native extension is configured by this extconf.rb.
  s.extensions = ["ext/charlock_holmes/extconf.rb"]
  # Package every file tracked by git.
  s.files = `git ls-files`.split("\n")
  s.homepage = %q{http://github.com/brianmario/charlock_holmes}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.4.2}
  s.summary = %q{Character encoding detection, brought to you by ICU}
  # The Rakefile runs test/**/*_test.rb, so the tests live under test/;
  # listing `spec` here produced an empty test file list.
  s.test_files = `git ls-files test`.split("\n")

  # tests
  s.add_development_dependency 'rake-compiler', ">= 0.7.5"
  s.add_development_dependency 'minitest'
  # benchmarks
  s.add_development_dependency 'chardet'
end
#ifndef CHARLOCK_COMMON_H
#define CHARLOCK_COMMON_H
// tell rbx not to use its caching compat layer
// by doing this we're making a promise to RBX that
// we'll never modify the pointers we get back from RSTRING_PTR
#define RSTRING_NOT_MODIFIED
#include <ruby.h>
#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
#endif
/*
 * Build a Ruby String from the `len` bytes starting at `str`.
 *
 * When <ruby/encoding.h> is available the string is created with
 * rb_external_str_new_with_enc(), treating `encoding` as an rb_encoding *.
 * Otherwise a plain rb_str_new() string is returned and `encoding` is unused.
 */
static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
{
#ifdef HAVE_RUBY_ENCODING_H
	rb_encoding *enc = (rb_encoding *)encoding;

	return rb_external_str_new_with_enc(str, len, enc);
#else
	return rb_str_new(str, len);
#endif
}
/*
 * Build a Ruby String from the `len` bytes starting at `str`.
 *
 * With <ruby/encoding.h> available the bytes are handed to
 * rb_external_str_new_with_enc() together with the UTF-8 encoding; without
 * it the result comes from rb_str_new().
 */
static VALUE charlock_new_str(const char *str, size_t len)
{
#ifdef HAVE_RUBY_ENCODING_H
	rb_encoding *utf8 = rb_utf8_encoding();

	return rb_external_str_new_with_enc(str, len, utf8);
#else
	return rb_str_new(str, len);
#endif
}
/*
 * Build a Ruby String from the NUL-terminated C string `str`.
 *
 * With <ruby/encoding.h> available the string length is measured with
 * strlen() and the bytes are passed to rb_external_str_new_with_enc() with
 * the UTF-8 encoding; without it rb_str_new2() is used.
 */
static VALUE charlock_new_str2(const char *str)
{
#ifdef HAVE_RUBY_ENCODING_H
	size_t len = strlen(str);

	return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
#else
	return rb_str_new2(str);
#endif
}
#endif
\ No newline at end of file
#include "unicode/ucnv.h"
#include "common.h"
extern VALUE rb_mCharlockHolmes;
static VALUE rb_cConverter;
/*
 * call-seq: converted = Converter.convert str, src_enc, dst_enc
 *
 * Transcode a string from one encoding to another using ICU.
 *
 * str     - a String, the content to transcode
 * src_enc - a String, the name of the encoding str is assumed to be in
 * dst_enc - a String, the name of the encoding to convert to
 *
 * Returns: a new String holding the converted bytes (an empty String when
 *          the conversion produces no output, e.g. for empty input)
 * Raises:  TypeError if any argument is not a String;
 *          ArgumentError if ICU reports an error, if an encoding name
 *          contains a NUL byte, or if str is too large for ICU's 32-bit
 *          length; NoMemoryError if the output buffer cannot be allocated
 */
static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
	VALUE rb_out;
	const char *src_enc;
	const char *dst_enc;
	const char *src_txt;
	char *out_buf = NULL;
	void *rb_enc = NULL;
	long txt_len;
	int32_t src_len;
	int32_t out_len;
	UErrorCode status = U_ZERO_ERROR;

	Check_Type(rb_txt, T_STRING);
	Check_Type(rb_src_enc, T_STRING);
	Check_Type(rb_dst_enc, T_STRING);

	// ICU takes an int32_t length; refuse input that would be truncated
	txt_len = RSTRING_LEN(rb_txt);
	if (txt_len > INT32_MAX) {
		rb_raise(rb_eArgError, "string is too large to convert");
	}

	src_txt = RSTRING_PTR(rb_txt);
	src_len = (int32_t)txt_len;

	// ICU reads the converter names as C strings, so they must be
	// NUL-terminated and free of embedded NUL bytes
	src_enc = StringValueCStr(rb_src_enc);
	dst_enc = StringValueCStr(rb_dst_enc);

	// first determine the size of the output buffer
	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);

	if (status == U_BUFFER_OVERFLOW_ERROR) {
		out_buf = malloc(out_len);
		if (out_buf == NULL) {
			rb_raise(rb_eNoMemError, "failed to allocate conversion buffer");
		}

		// now do the actual conversion
		status = U_ZERO_ERROR;
		out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
		if (U_FAILURE(status)) {
			free(out_buf);
			rb_raise(rb_eArgError, "%s", u_errorName(status));
		}
	} else if (U_FAILURE(status)) {
		// a real error, e.g. an unknown converter name
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	} else {
		// the preflight did not overflow a zero-capacity buffer, so the
		// conversion yields no bytes; ICU reports this as a warning,
		// not as an error
		out_len = 0;
	}

#ifdef HAVE_RUBY_ENCODING_H
	rb_enc = (void *)rb_enc_find(dst_enc);
#endif

	rb_out = charlock_new_enc_str(out_buf ? out_buf : "", out_len, rb_enc);

	free(out_buf);

	return rb_out;
}
void _init_charlock_converter() {
rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
}
#include "unicode/ucsdet.h"
#include "magic.h"
#include "common.h"
extern VALUE rb_mCharlockHolmes;
static VALUE rb_cEncodingDetector;
/*
 * Native state wrapped by each EncodingDetector instance.
 */
typedef struct {
	/* ICU charset detector, opened in rb_encdec__alloc */
	UCharsetDetector *csd;
	/* libmagic handle used by detect_binary_content to spot binary data */
	magic_t magic;
} charlock_detector_t;
static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
{
UErrorCode status = U_ZERO_ERROR;
const char *mname;
const char *mlang;
int mconfidence;
VALUE rb_match;
if (!match)
return Qnil;
mname = ucsdet_getName(match, &status);
mlang = ucsdet_getLanguage(match, &status);
mconfidence = ucsdet_getConfidence(match, &status);
rb_match = rb_hash_new();
rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
if (mlang && mlang[0])
rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang));
return rb_match;
}
/*
 * Build the Hash returned for binary content:
 * {:type => :binary, :confidence => 100}
 */
static VALUE rb_encdec_binarymatch() {
	VALUE result = rb_hash_new();

	rb_hash_aset(result, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
	rb_hash_aset(result, ID2SYM(rb_intern("confidence")), INT2NUM(100));

	return result;
}
/*
 * Ask libmagic whether the bytes of rb_str look like binary content.
 *
 * Returns: 1 when libmagic's description does not contain "text",
 *          0 when it does
 * Raises:  StandardError with libmagic's error message when magic_buffer
 *          returns no description
 */
static int detect_binary_content(charlock_detector_t *detector, VALUE rb_str) {
	const char *description;

	description = magic_buffer(detector->magic, RSTRING_PTR(rb_str), RSTRING_LEN(rb_str));

	if (description == NULL) {
		rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
	}

	// anything libmagic does not describe as text is treated as binary
	if (strstr(description, "text") == NULL) {
		return 1;
	}

	return 0;
}
/*
 * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
 *
 * Attempt to detect the encoding of the given string.
 *
 * str      - a String, the content whose encoding should be detected
 * hint_enc - an optional String (like "UTF-8"), an encoding name handed to
 *            the charset detector as its declared encoding
 *
 * Returns: for binary content a Hash with :type => :binary and :confidence;
 *          otherwise a Hash with :encoding, :type, :confidence and possibly
 *          :language, or nil when ICU yields no match
 */
static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
{
	UErrorCode status = U_ZERO_ERROR;
	charlock_detector_t *detector;
	const UCharsetMatch *best;
	VALUE text;
	VALUE hint;

	rb_scan_args(argc, argv, "11", &text, &hint);
	Check_Type(text, T_STRING);

	Data_Get_Struct(self, charlock_detector_t, detector);

	// binary content short-circuits the charset detection entirely
	if (detect_binary_content(detector, text)) {
		return rb_encdec_binarymatch();
	}

	// the data doesn't look binary, so let ICU work out the text encoding
	ucsdet_setText(detector->csd, RSTRING_PTR(text), (int32_t)RSTRING_LEN(text), &status);

	if (!NIL_P(hint)) {
		Check_Type(hint, T_STRING);
		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(hint), RSTRING_LEN(hint), &status);
	}

	best = ucsdet_detect(detector->csd, &status);

	return rb_encdec_buildmatch(best);
}
/*
 * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
 *
 * Attempt to detect the encoding of this string, and return
 * a list with all the possible encodings that match it.
 *
 * str      - a String, what you want to detect the encoding of
 * hint_enc - an optional String (like "UTF-8"), the encoding name which will
 *            be used as an additional hint to the charset detector
 *
 * Returns: an Array with zero or more Hashes. When the content looks binary
 *          the first Hash is the binary match (:type and :confidence); it is
 *          followed by one Hash per ICU match, each with :encoding, :type,
 *          :confidence and possibly :language. If ICU reports an error no
 *          ICU matches are added.
 */
static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
{
	UErrorCode status = U_ZERO_ERROR;
	charlock_detector_t *detector;
	const UCharsetMatch **csm;
	VALUE rb_ret;
	int32_t i;
	// initialised because ICU leaves the count untouched when it fails
	int32_t match_count = 0;
	VALUE rb_str;
	VALUE rb_enc_hint;

	rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
	Check_Type(rb_str, T_STRING);

	Data_Get_Struct(self, charlock_detector_t, detector);

	rb_ret = rb_ary_new();

	// first lets see if this is binary content; the binary match, if any,
	// always comes first in the result
	if (detect_binary_content(detector, rb_str)) {
		rb_ary_push(rb_ret, rb_encdec_binarymatch());
	}

	ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);

	if (!NIL_P(rb_enc_hint)) {
		Check_Type(rb_enc_hint, T_STRING);
		ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), (int32_t)RSTRING_LEN(rb_enc_hint), &status);
	}

	csm = ucsdet_detectAll(detector->csd, &match_count, &status);

	// only walk the match array when ICU actually produced one
	if (U_SUCCESS(status) && csm != NULL) {
		for (i = 0; i < match_count; ++i) {
			rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
		}
	}

	return rb_ret;
}
/*
 * call-seq: EncodingDetector#strip_tags?
 *
 * Reports whether the ICU input filter (the strip_tags flag) is enabled
 * on this detector.
 *
 * Returns: Boolean
 */
static VALUE rb_get_strip_tags(VALUE self)
{
	charlock_detector_t *detector;
	UBool enabled;

	Data_Get_Struct(self, charlock_detector_t, detector);

	enabled = ucsdet_isInputFilterEnabled(detector->csd);

	if (enabled == 1) {
		return Qtrue;
	}

	return Qfalse;
}
/*
 * call-seq: EncodingDetector#strip_tags = true
 *
 * Turn the stripping of HTML/XML tags from the input on or off. The filter
 * is enabled only when the value passed is exactly true; any other value
 * disables it.
 *
 * Returns: the value passed
 */
static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
{
	charlock_detector_t *detector;
	UBool enabled = 0;

	Data_Get_Struct(self, charlock_detector_t, detector);

	if (rb_val == Qtrue) {
		enabled = 1;
	}

	ucsdet_enableInputFilter(detector->csd, enabled);

	return rb_val;
}
/*
 * call-seq: detectable_encodings = EncodingDetector.supported_encodings
 *
 * The list of detectable encodings supported by this library. The list is
 * built on first use and cached on the class in the "encoding_list"
 * instance variable.
 *
 * Returns: an Array of Strings
 * Raises:  StandardError if ICU reports an error while listing the charsets
 */
static VALUE rb_get_supported_encodings(VALUE klass)
{
	UCharsetDetector *csd;
	UErrorCode status = U_ZERO_ERROR;
	UEnumeration *encoding_list;
	VALUE rb_encoding_list;
	int32_t enc_count;
	int32_t i;
	const char *enc_name;
	int32_t enc_name_len;

	rb_encoding_list = rb_iv_get(klass, "encoding_list");

	// already populated by an earlier call
	if (!NIL_P(rb_encoding_list)) {
		return rb_encoding_list;
	}

	// lazily populate the list
	csd = ucsdet_open(&status);
	if (U_FAILURE(status)) {
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
	if (U_FAILURE(status)) {
		ucsdet_close(csd);
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	rb_encoding_list = rb_ary_new();
	enc_count = uenum_count(encoding_list, &status);

	// stop as soon as ICU reports an error or runs out of names
	for (i = 0; U_SUCCESS(status) && i < enc_count; i++) {
		enc_name = uenum_next(encoding_list, &enc_name_len, &status);
		if (U_FAILURE(status) || enc_name == NULL) {
			break;
		}
		rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
	}

	// release the ICU objects before raising or caching anything
	uenum_close(encoding_list);
	ucsdet_close(csd);

	if (U_FAILURE(status)) {
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	rb_iv_set(klass, "encoding_list", rb_encoding_list);

	return rb_encoding_list;
}
/*
 * Free function registered with Data_Wrap_Struct: closes the ICU detector
 * and the libmagic handle when they are set, then frees the struct itself.
 */
static void rb_encdec__free(void *obj)
{
	charlock_detector_t *detector = (charlock_detector_t *)obj;

	if (detector->csd != NULL) {
		ucsdet_close(detector->csd);
	}

	if (detector->magic != NULL) {
		magic_close(detector->magic);
	}

	free(detector);
}
/*
 * Allocator for EncodingDetector: creates the native detector state and
 * wraps it in a Ruby object of class klass.
 *
 * Returns: the new, wrapped object
 * Raises:  NoMemoryError if the native struct cannot be allocated;
 *          StandardError if the ICU detector or the libmagic handle
 *          cannot be opened
 */
static VALUE rb_encdec__alloc(VALUE klass)
{
	charlock_detector_t *detector;
	UErrorCode status = U_ZERO_ERROR;
	VALUE obj;

	detector = calloc(1, sizeof(charlock_detector_t));
	if (detector == NULL) {
		rb_raise(rb_eNoMemError, "failed to allocate encoding detector");
	}

	// wrap the zeroed struct before opening anything, so rb_encdec__free
	// is already registered for it if one of the raises below fires
	obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector);

	detector->csd = ucsdet_open(&status);
	if (U_FAILURE(status)) {
		rb_raise(rb_eStandardError, "%s", u_errorName(status));
	}

	detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
	if (detector->magic == NULL) {
		rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
	}

	return obj;
}
void _init_charlock_encoding_detector()
{
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
}
#include "common.h"
extern void _init_charlock_encoding_detector();
extern void _init_charlock_converter();
extern void _init_charlock_transliterator();
VALUE rb_mCharlockHolmes;
void Init_charlock_holmes() {
rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
_init_charlock_encoding_detector();
_init_charlock_converter();
_init_charlock_transliterator();
}
\ No newline at end of file
require 'mkmf'
CWD = File.expand_path(File.dirname(__FILE__))
# Echo a shell command, then run it through mkmf's xsystem.
#
# cmd - the command line to run, as a String
#
# Returns the truthy value xsystem produced.
# Raises RuntimeError when xsystem returns a falsy value.
def sys(cmd)
  puts " -- #{cmd}"

  result = xsystem(cmd)
  return result if result

  raise "#{cmd} failed, please report issue on http://github.com/brianmario/charlock_holmes"
end
# Stop with a banner on stderr when `which make` finds nothing.
make_location = `which make`.strip
if make_location.empty?
  STDERR.puts "\n\n",
              "***************************************************************************************",
              "*************** make required (apt-get install make build-essential) =( ***************",
              "***************************************************************************************"
  exit(1)
end
##
# ICU dependency
#

# Adds the --with-icu-dir / --with-icu-include / --with-icu-lib options.
dir_config 'icu'

# detect homebrew installs
unless have_library 'icui18n'
  # Homebrew prefix: ask brew when it is on the PATH, otherwise fall back to
  # /usr/local when an icu4c keg exists there. The prefix must not include
  # "Cellar" itself, because the glob below appends it.
  base = if !`which brew`.empty?
    `brew --prefix`.strip
  elsif File.exist?("/usr/local/Cellar/icu4c")
    '/usr/local'
  end

  # Use the last installed icu4c version directory in sort order.
  if base && (icu4c = Dir[File.join(base, 'Cellar/icu4c/*')].sort.last)
    $INCFLAGS << " -I#{icu4c}/include "
    $LDFLAGS  << " -L#{icu4c}/lib "
  end
end

# Check again now that the Homebrew paths may have been added.
unless have_library('icui18n') && have_header('unicode/ucnv.h')
  STDERR.puts "\n\n"
  STDERR.puts "***************************************************************************************"
  STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********"
  STDERR.puts "***************************************************************************************"
  exit(1)
end
##
# libmagic dependency
#

# File name of the `file` source tarball that is unpacked from #{CWD}/src.
src = File.basename('file-5.08.tar.gz')
# The tarball name without its ".tar.gz" suffix; the build changes into this
# directory after unpacking.
dir = File.basename(src, '.tar.gz')
Dir.chdir("#{CWD}/src") do
FileUtils.rm_rf(dir) if File.exists?(dir)
sys("tar zxvf #{src}")
Dir.chdir(dir) do
sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
sys("patch -p0 < ../file-soft-check.patch")
sys("make -C src install")
sys("make -C magic install")
end