New upstream version 0.7.5

parent ae5a9582
/Gemfile.lock
.bundle/
tmp/
vendor/
*.bundle
ext/charlock_holmes/dst
*.a
ext/charlock_holmes/src/file-*
ext/charlock_holmes/src/mkmf.log
\ No newline at end of file
Copyright (c) 2011 Brian Lopez - https://github.com/brianmario
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
# CharlockHolmes
Character encoding detecting library for Ruby using [ICU](http://site.icu-project.org/)
## Usage
First you'll need to require it
``` ruby
require 'charlock_holmes'
```
## Encoding detection
``` ruby
contents = File.read('test.xml')
detection = CharlockHolmes::EncodingDetector.detect(contents)
# => {:encoding => 'UTF-8', :confidence => 100, :type => :text}
# optionally there will be a :language key as well, but
# that's mostly only returned for legacy encodings like ISO-8859-1
```
NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
For binary content, `:type` will be set to `:binary`
Though it's more efficient to reuse one detector instance:
``` ruby
detector = CharlockHolmes::EncodingDetector.new
detection1 = detector.detect(File.read('test.xml'))
detection2 = detector.detect(File.read('test2.json'))
# and so on...
```
### String monkey patch
Alternatively, you can just use the `detect_encoding` method on the `String` class
``` ruby
require 'charlock_holmes/string'
contents = File.read('test.xml')
detection = contents.detect_encoding
```
### Ruby 1.9 specific
NOTE: This method only exists on Ruby 1.9+
If you want to use this library to detect and set the encoding flag on strings, you can use the `detect_encoding!` method on the `String` class
``` ruby
require 'charlock_holmes/string'
contents = File.read('test.xml')
# this will detect and set the encoding of `contents`, then return self
contents.detect_encoding!
```
## Transcoding
Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.
``` ruby
content = File.read('test2.txt')
detection = CharlockHolmes::EncodingDetector.detect(content)
utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
```
The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.
## Installing
If the traditional `gem install charlock_holmes` doesn't work, you may need to specify the path to
your installation of ICU using the `--with-icu-dir` option during the gem install or by configuring Bundler to
pass those arguments to Gem:
Configure Bundler to always use the correct arguments when installing:
bundle config build.charlock_holmes --with-icu-dir=/path/to/installed/icu4c
Using Gem to install directly without Bundler:
gem install charlock_holmes -- --with-icu-dir=/path/to/installed/icu4c
### Homebrew
If you're installing on Mac OS X then using [Homebrew](http://mxcl.github.com/homebrew/) is
the easiest way to install ICU.
However, be warned; it is a Keg-Only (see [Homebrew issue #167](https://github.com/mxcl/homebrew/issues/167)
for more info) install, meaning RubyGems won't find it when installing without specifying `--with-icu-dir`.
To install ICU with Homebrew:
brew install icu4c
Configure Bundler to always use the correct arguments when installing:
bundle config build.charlock_holmes --with-icu-dir=/usr/local/opt/icu4c
Using Gem to install directly without Bundler:
gem install charlock_holmes -- --with-icu-dir=/usr/local/opt/icu4c
# Rake tasks for the gem: a test task, a compile task for the native
# extension, and a dependency that compiles before the tests run.
require 'rake/testtask'
# Test task: picks up every file matching test/**/*_test.rb.
Rake::TestTask.new do |t|
t.pattern = "test/**/*_test.rb"
end
# Running `rake` with no task name runs the tests.
task :default => :test
# Activate rake-compiler (>= 0.7.5) before requiring its extension task.
gem 'rake-compiler', '>= 0.7.5'
require "rake/extensiontask"
# Extension task named 'charlock_holmes'; lib_dir is set to lib/charlock_holmes.
Rake::ExtensionTask.new 'charlock_holmes' do |ext|
ext.lib_dir = File.join 'lib', 'charlock_holmes'
end
# Add :compile to the prerequisites of the already-defined :test task.
Rake::Task[:test].prerequisites << :compile
\ No newline at end of file
# Benchmark script: times encoding detection through the class-level
# CharlockHolmes::EncodingDetector.detect call, through one reused detector
# instance, and (when RUBY_19 is false) through chardet's UniversalDetector.
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

# True only for the Ruby 1.9 series. The match is anchored at the start of the
# version string and the dot is escaped; an unanchored /1.9/ would also match
# versions such as "2.1.9".
# NOTE(review): on Ruby 2.0 and later this is false, so chardet is still
# required below — confirm chardet loads on those versions.
RUBY_19 = !!(RUBY_VERSION =~ /\A1\.9(?:\.|\z)/)

require 'charlock_holmes'

# the chardet gem isn't compatible with 1.9
require 'UniversalDetector' unless RUBY_19

require 'benchmark'

# Input text (test.txt next to this script), detections per report, and the
# detector instance shared by the "reusing" report.
CONTENT = File.read(File.expand_path('../test.txt', __FILE__))
TIMES = 100
DETECTOR = CharlockHolmes::EncodingDetector.new

Benchmark.bmbm do |x|
  # new detector every iteration
  x.report 'singleton call' do
    TIMES.times do
      CharlockHolmes::EncodingDetector.detect CONTENT
    end
  end

  # shared detector for all iterations
  x.report 'reusing a single detector' do
    TIMES.times do
      DETECTOR.detect CONTENT
    end
  end

  # chardet comparison, skipped where the gem was not loaded above
  unless RUBY_19
    x.report 'chardet' do
      TIMES.times do
        UniversalDetector.chardet CONTENT
      end
    end
  end
end
This diff is collapsed.
# encoding: utf-8
require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
#########################################################
# This file has been automatically generated by gem2tgz #
#########################################################
# -*- encoding: utf-8 -*-
# Gem specification for charlock_holmes.
# NOTE(review): name, version, authors, date, email, description, files,
# homepage, rubygems_version and summary are each assigned twice below, and the
# development dependencies are declared twice. For these attribute writers the
# later assignment replaces the earlier one — confirm which set is intended.
Gem::Specification.new do |s|
s.name = %q{charlock_holmes}
s.license = "MIT"
s.version = CharlockHolmes::VERSION
s.authors = ["Brian Lopez", "Vicent Martí"]
s.date = Time.now.utc.strftime("%Y-%m-%d")
s.email = %q{seniorlopez@gmail.com}
s.name = "charlock_holmes"
s.version = "0.7.5"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Brian Lopez", "Vicent Mart\u{ed}"]
s.date = "2017-08-14"
s.description = "charlock_holmes provides binary and text detection as well as text transcoding using libicu"
s.email = "seniorlopez@gmail.com"
# Native extension build script.
s.extensions = ["ext/charlock_holmes/extconf.rb"]
# First file list shells out to git; it is overwritten by the literal list below.
s.files = `git ls-files`.split("\n")
s.homepage = %q{https://github.com/brianmario/charlock_holmes}
s.files = ["ext/charlock_holmes/common.h", "ext/charlock_holmes/converter.c", "ext/charlock_holmes/encoding_detector.c", "ext/charlock_holmes/ext.c", "ext/charlock_holmes/extconf.rb", "ext/charlock_holmes/transliterator.cpp", "lib/charlock_holmes.rb", "lib/charlock_holmes/encoding_detector.rb", "lib/charlock_holmes/string.rb", "lib/charlock_holmes/version.rb"]
s.homepage = "https://github.com/brianmario/charlock_holmes"
s.licenses = ["MIT"]
s.rdoc_options = ["--charset=UTF-8"]
s.require_paths = ["lib"]
s.rubygems_version = %q{1.4.2}
s.summary = %q{Character encoding detection, brought to you by ICU}
s.description = "charlock_holmes provides binary and text detection as well as text transcoding using libicu"
# Shells out to git and lists files under the path "spec".
s.test_files = `git ls-files spec`.split("\n")
s.required_ruby_version = Gem::Requirement.new(">= 1.9.3")
s.rubygems_version = "1.8.23"
s.summary = "Character encoding detection, brought to you by ICU"
# Dependency declarations. Development dependencies are used when the spec
# responds to specification_version and RubyGems is >= 1.2.0; otherwise the
# same gems are added with add_dependency.
if s.respond_to? :specification_version then
s.specification_version = 4
# tests
s.add_development_dependency 'rake-compiler', ">= 0.7.5"
s.add_development_dependency 'minitest'
# benchmarks
s.add_development_dependency 'chardet'
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
s.add_development_dependency(%q<chardet>, [">= 0"])
s.add_development_dependency(%q<minitest>, [">= 0"])
s.add_development_dependency(%q<rake-compiler>, [">= 0.7.5"])
else
s.add_dependency(%q<chardet>, [">= 0"])
s.add_dependency(%q<minitest>, [">= 0"])
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
end
else
s.add_dependency(%q<chardet>, [">= 0"])
s.add_dependency(%q<minitest>, [">= 0"])
s.add_dependency(%q<rake-compiler>, [">= 0.7.5"])
end
end
......@@ -11,7 +11,7 @@
#include <ruby/encoding.h>
#endif
static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
{
#ifdef HAVE_RUBY_ENCODING_H
return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding);
......@@ -20,7 +20,7 @@ static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
#endif
}
static VALUE charlock_new_str(const char *str, size_t len)
static inline VALUE charlock_new_str(const char *str, size_t len)
{
#ifdef HAVE_RUBY_ENCODING_H
return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
......@@ -29,7 +29,7 @@ static VALUE charlock_new_str(const char *str, size_t len)
#endif
}
static VALUE charlock_new_str2(const char *str)
static inline VALUE charlock_new_str2(const char *str)
{
#ifdef HAVE_RUBY_ENCODING_H
return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
......@@ -38,4 +38,4 @@ static VALUE charlock_new_str2(const char *str)
#endif
}
#endif
\ No newline at end of file
#endif
......@@ -129,6 +129,23 @@ static int detect_binary_content(VALUE self, VALUE rb_str) {
return !!memchr(buf, 0, buf_len);
}
/*
 * call-seq: true/false = EncodingDetector.is_binary? str
 *
 * Report whether a string is classified as binary content.
 *
 * str - the String to run the binary check against
 *
 * Returns: true when detect_binary_content() reports a non-zero result for
 * the string, false otherwise
 */
static VALUE rb_encdec_is_binary(VALUE self, VALUE str)
{
	/* Map the C int result onto the Ruby boolean singletons. */
	return detect_binary_content(self, str) ? Qtrue : Qfalse;
}
/*
* call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
*
......@@ -350,6 +367,7 @@ void _init_charlock_encoding_detector()
{
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1);
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
......
......@@ -24,6 +24,8 @@ end
dir_config 'icu'
rubyopt = ENV.delete("RUBYOPT")
icu4c = "/usr"
# detect homebrew installs
if !have_library 'icui18n'
base = if !`which brew`.empty?
......@@ -50,6 +52,13 @@ have_library 'z' or abort 'libz missing'
have_library 'icuuc' or abort 'libicuuc missing'
have_library 'icudata' or abort 'libicudata missing'
# icu4c might be built in C++11 mode, but it also might not have been
# Locate icu-config: first on PATH, otherwise under the icu4c prefix's bin/.
icuconfig = `which icu-config`.chomp
icuconfig = "#{icu4c}/bin/icu-config" if icuconfig.empty?
# Only add -std=c++11 when icu-config exists and its --cxxflags mention c++11.
if File.exist?(icuconfig) && `#{icuconfig} --cxxflags`.include?("c++11")
$CXXFLAGS << ' -std=c++11'
end
# Always-on C flags; the extra warnings and debug flags apply only when the
# DEBUG environment variable is set.
$CFLAGS << ' -Wall -funroll-loops'
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
......
module CharlockHolmes
  # Gem version string. Assigned exactly once: a second assignment to the same
  # constant makes Ruby emit an "already initialized constant" warning.
  VERSION = "0.7.5"
end
--- !ruby/object:Gem::Specification
name: charlock_holmes
version: !ruby/object:Gem::Version
version: 0.7.3
platform: ruby
authors:
- Brian Lopez
- Vicent Martí
autorequire:
bindir: bin
cert_chain: []
date: 2014-06-07 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
name: rake-compiler
requirement: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: 0.7.5
type: :development
prerelease: false
version_requirements: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: 0.7.5
- !ruby/object:Gem::Dependency
name: minitest
requirement: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: '0'
type: :development
prerelease: false
version_requirements: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: '0'
- !ruby/object:Gem::Dependency
name: chardet
requirement: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: '0'
type: :development
prerelease: false
version_requirements: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: '0'
description: charlock_holmes provides binary and text detection as well as text transcoding
using libicu
email: seniorlopez@gmail.com
executables: []
extensions:
- ext/charlock_holmes/extconf.rb
extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- MIT-LICENSE
- README.md
- Rakefile
- benchmark/detection.rb
- benchmark/test.txt
- charlock_holmes.gemspec
- ext/charlock_holmes/common.h
- ext/charlock_holmes/converter.c
- ext/charlock_holmes/encoding_detector.c
- ext/charlock_holmes/ext.c
- ext/charlock_holmes/extconf.rb
- ext/charlock_holmes/transliterator.cpp
- lib/charlock_holmes.rb
- lib/charlock_holmes/encoding_detector.rb
- lib/charlock_holmes/string.rb
- lib/charlock_holmes/version.rb
- test/converter_test.rb
- test/encoding_detector_test.rb
- test/fixtures/AnsiGraph.psm1
- test/fixtures/ISO-2022-KR.txt
- test/fixtures/TwigExtensionsDate.es.yml
- test/fixtures/cl-messagepack.lisp
- test/fixtures/core.rkt
- test/fixtures/foo.pdf
- test/fixtures/hello_world
- test/fixtures/laholator.py
- test/fixtures/octocat.ai
- test/fixtures/octocat.gif
- test/fixtures/octocat.jpg
- test/fixtures/octocat.png
- test/fixtures/octocat.psd
- test/fixtures/repl2.cljs
- test/fixtures/sierpinski.ps
- test/fixtures/utf16be.html
- test/fixtures/utf32be.html
- test/fixtures/utf32le.html
- test/fixtures/utf8.html
- test/fixtures/vimrc
- test/helper.rb
- test/string_methods_test.rb
- test/transliterator_test.rb
homepage: https://github.com/brianmario/charlock_holmes
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options:
- --charset=UTF-8
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
requirements:
- - '>='
- !ruby/object:Gem::Version
version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.0.3
signing_key:
specification_version: 4
summary: Character encoding detection, brought to you by ICU
test_files: []
# encoding: utf-8
require File.expand_path("../helper", __FILE__)
# Tests for CharlockHolmes::Converter.convert(content, source_encoding,
# destination_encoding): round-trip conversions and argument type checks.
class ConverterTest < MiniTest::Test
  # ASCII text grows when transcoded to UTF-16 and is restored by the reverse
  # conversion.
  def test_convert_ascii_from_iso859_1_to_utf16_and_back
    input = 'test'

    output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16'
    assert input.bytesize < output.bytesize
    assert input != output

    output = CharlockHolmes::Converter.convert output, 'UTF-16', 'ISO-8859-1'
    assert_equal input.bytesize, output.bytesize
    assert_equal input, output
  end

  # Same round trip with non-ASCII UTF-8 input.
  def test_convert_utf8_to_utf16_and_back
    input = 'λ, λ, λ'

    output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16'
    assert input.bytesize < output.bytesize
    assert input != output

    output = CharlockHolmes::Converter.convert output, 'UTF-16', 'UTF-8'
    assert_equal input.bytesize, output.bytesize
    assert_equal input, output
  end

  # A nil in any of the three positions raises TypeError; three strings do not
  # raise.
  def test_params_must_be_strings
    assert_raises TypeError do
      CharlockHolmes::Converter.convert nil, 'UTF-8', 'UTF-16'
    end

    assert_raises TypeError do
      CharlockHolmes::Converter.convert 'lol', nil, 'UTF-16'
    end

    assert_raises TypeError do
      CharlockHolmes::Converter.convert 'lol', 'UTF-8', nil
    end

    # Only StandardError is rescued, so interrupts and other process-level
    # exceptions propagate instead of being reported as a test failure.
    begin
      CharlockHolmes::Converter.convert 'lol', 'UTF-8', 'UTF-16'
    rescue StandardError => e
      flunk "#{e.class.name} raised, expected nothing"
    end
  end
end
\ No newline at end of file
# encoding: utf-8
require File.expand_path("../helper", __FILE__)
# Tests for CharlockHolmes::EncodingDetector: class-level and instance-level
# detect / detect_all, the strip_tags flag, the supported-encodings list, and
# detection against the fixture files listed in MAPPING.
class EncodingDetectorTest < MiniTest::Test
  def setup
    @detector = CharlockHolmes::EncodingDetector.new
  end

  def test_has_class_level_detect_method
    # Assert the respond_to? check; a bare call would discard the result.
    assert_respond_to CharlockHolmes::EncodingDetector, :detect
    detected = CharlockHolmes::EncodingDetector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_class_level_detect_accepts_encoding_hint
    assert_respond_to CharlockHolmes::EncodingDetector, :detect
    detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_has_class_level_detect_all_method
    assert_respond_to CharlockHolmes::EncodingDetector, :detect_all
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  def test_class_level_detect_all_method_accepts_encoding_hint
    assert_respond_to CharlockHolmes::EncodingDetector, :detect_all
    detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  def test_has_detect_method
    assert_respond_to @detector, :detect
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_detect_accepts_encoding_hint
    assert_respond_to @detector, :detect
    detected = @detector.detect 'test', 'UTF-8'
    assert_equal 'ISO-8859-1', detected[:encoding]
  end

  def test_has_detect_all_method
    assert_respond_to @detector, :detect_all
    detected_list = @detector.detect_all 'test'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  def test_detect_all_accepts_encoding_hint
    assert_respond_to @detector, :detect_all
    detected_list = @detector.detect_all 'test', 'UTF-8'
    assert detected_list.is_a? Array

    encoding_list = detected_list.map {|d| d[:encoding]}.sort
    assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
  end

  # The same markup is expected to be detected as UTF-8 with strip_tags on
  # and with it off.
  def test_strip_tags_flag
    detector = CharlockHolmes::EncodingDetector.new
    detector.strip_tags = true
    assert detector.strip_tags

    detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
    assert_equal 'UTF-8', detection[:encoding]

    detector.strip_tags = false
    assert !detector.strip_tags

    detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
    assert_equal 'UTF-8', detection[:encoding]
  end

  def test_has_list_of_supported_encodings
    assert_respond_to CharlockHolmes::EncodingDetector, :supported_encodings
    supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings

    assert supported_encodings.is_a?(Array)
    assert supported_encodings.include? 'UTF-8'
    assert supported_encodings.include? 'windows-1250'
    assert supported_encodings.include? 'windows-1252'
    assert supported_encodings.include? 'windows-1253'
    assert supported_encodings.include? 'windows-1254'
    assert supported_encodings.include? 'windows-1255'
  end

  # :ruby_encoding is expected to equal the detected name for ISO-8859-1 and
  # to be 'binary' for the ISO-2022-KR fixture.
  def test_returns_a_ruby_compatible_encoding_name
    detected = @detector.detect 'test'
    assert_equal 'ISO-8859-1', detected[:encoding]
    assert_equal 'ISO-8859-1', detected[:ruby_encoding]

    not_compat_txt = fixture("ISO-2022-KR.txt").read
    detected = @detector.detect not_compat_txt
    assert_equal 'ISO-2022-KR', detected[:encoding]
    assert_equal 'binary', detected[:ruby_encoding]
  end

  # Each row: [fixture file name, expected :encoding (nil for binary files),
  # expected :type].
  MAPPING = [
    ['repl2.cljs',                'ISO-8859-1', :text],
    ['cl-messagepack.lisp',       'ISO-8859-1', :text],
    ['sierpinski.ps',             'ISO-8859-1', :text],
    ['core.rkt',                  'UTF-8',      :text],
    ['TwigExtensionsDate.es.yml', 'UTF-8',      :text],
    ['laholator.py',              'UTF-8',      :text],
    ['vimrc',                     'UTF-8',      :text],
    ['AnsiGraph.psm1',            'UTF-16LE',   :text],
    ['utf16be.html',              'UTF-16BE',   :text],
    ['utf32le.html',              'UTF-32LE',   :text],
    ['utf32be.html',              'UTF-32BE',   :text],
    ['hello_world',               nil,          :binary],
    ['octocat.png',               nil,          :binary],
    ['octocat.jpg',               nil,          :binary],
    ['octocat.psd',               nil,          :binary],
    ['octocat.gif',               nil,          :binary],
    ['octocat.ai',                nil,          :binary],
    ['foo.pdf',                   nil,          :binary],
  ].freeze

  def test_detection_works_as_expected
    MAPPING.each do |file, encoding, type|
      content = fixture(file).read
      guessed = @detector.detect content

      # assert_equal with a nil expectation is deprecated in Minitest 5, so
      # the binary rows use assert_nil. Messages name the fixture so a failure
      # identifies which file was misdetected.
      if encoding.nil?
        assert_nil guessed[:encoding], "unexpected encoding for #{file}"
      else
        assert_equal encoding, guessed[:encoding], "wrong encoding for #{file}"
      end
      assert_equal type, guessed[:type], "wrong type for #{file}"

      # Where String#force_encoding exists, text content tagged with the
      # detected encoding must be valid in that encoding.
      if content.respond_to?(:force_encoding) && guessed[:type] == :text
        content.force_encoding guessed[:encoding]
        assert content.valid_encoding?, "invalid #{guessed[:encoding]} bytes in #{file}"
      end
    end
  end
end
Binary files a/test/fixtures/AnsiGraph.psm1 and /dev/null differ
$)C#
# Out-AnsiGraph.psm1
# Author: xcud
# History:
# v0.1 September 21, 2009 initial version
#
# PS Example> ps | select -first 5 | sort -property VM |
# Out-AnsiGraph ProcessName, VM
# AEADISRV  14508032
# audiodg  50757632
# conhost  73740288
# AppleMobileDeviceService  92061696
# btdna  126443520
#
function Out-AnsiGraph($Parameter1=$null) {
BEGIN {
$q = new-object Collections.queue
$max = 0; $namewidth = 0;
}
PROCESS {
if($_) {
$name = $_.($Parameter1[0]);
$val = $_.($Parameter1[1])
if($max -lt $val) { $max = $val}
if($namewidth -lt $name.length) {
$namewidth = $name.length }
$q.enqueue(@($name, $val))
}
}
END {
$q | %{
$graph = ""; 0..($_[1]/$max*20) |
%{ $graph += "" }
$name = "{0,$namewidth}" -f $_[0]
"$name $graph " + $_[1]
}
}
}
Export-ModuleMember Out-AnsiGraph