Imported Upstream version 0.7.3

parent a39122bf
/Gemfile.lock
.bundle/
tmp/
vendor/
......
source :rubygems
source "https://rubygems.org"
gemspec
PATH
remote: .
specs:
charlock_holmes (0.6.9.4)
GEM
remote: http://rubygems.org/
specs:
chardet (0.9.0)
minitest (4.6.2)
rake (0.9.2)
rake-compiler (0.7.9)
rake
PLATFORMS
ruby
DEPENDENCIES
chardet
charlock_holmes!
minitest
rake-compiler (>= 0.7.5)
Copyright (c) 2011 Brian Lopez - http://github.com/brianmario
Copyright (c) 2011 Brian Lopez - https://github.com/brianmario
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
......
......@@ -4,17 +4,19 @@ require './lib/charlock_holmes/version' unless defined? CharlockHolmes::VERSION
Gem::Specification.new do |s|
s.name = %q{charlock_holmes}
s.license = "MIT"
s.version = CharlockHolmes::VERSION
s.authors = ["Brian Lopez", "Vicent Martí"]
s.date = Time.now.utc.strftime("%Y-%m-%d")
s.email = %q{seniorlopez@gmail.com}
s.extensions = ["ext/charlock_holmes/extconf.rb"]
s.files = `git ls-files`.split("\n")
s.homepage = %q{http://github.com/brianmario/charlock_holmes}
s.homepage = %q{https://github.com/brianmario/charlock_holmes}
s.rdoc_options = ["--charset=UTF-8"]
s.require_paths = ["lib"]
s.rubygems_version = %q{1.4.2}
s.summary = %q{Character encoding detection, brought to you by ICU}
s.description = "charlock_holmes provides binary and text detection as well as text transcoding using libicu"
s.test_files = `git ls-files spec`.split("\n")
# tests
......
#include "unicode/ucsdet.h"
#include "magic.h"
#include "common.h"
extern VALUE rb_mCharlockHolmes;
......@@ -7,7 +6,6 @@ static VALUE rb_cEncodingDetector;
typedef struct {
UCharsetDetector *csd;
magic_t magic;
} charlock_detector_t;
static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
......@@ -17,6 +15,9 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
const char *mlang;
int mconfidence;
VALUE rb_match;
VALUE enc_tbl;
VALUE enc_name;
VALUE compat_enc;
if (!match)
return Qnil;
......@@ -28,7 +29,16 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
rb_match = rb_hash_new();
rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
enc_name = charlock_new_str2(mname);
rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name);
enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table");
compat_enc = rb_hash_aref(enc_tbl, enc_name);
if (!NIL_P(compat_enc)) {
rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc);
}
rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
if (mlang && mlang[0])
......@@ -48,19 +58,75 @@ static VALUE rb_encdec_binarymatch() {
return rb_match;
}
/*
 * Decide whether the contents of `rb_str` look like binary data.
 *
 * A handful of well-known leading byte signatures are checked first,
 * longest signature first. If none match, the first `scan_len` bytes
 * (taken from the detector's @binary_scan_length) are searched for a
 * NUL byte.
 *
 * self   - the EncodingDetector instance; its @binary_scan_length ivar is
 *          converted with NUM2ULL, so it is expected to hold an Integer
 * rb_str - the Ruby String to inspect
 *
 * Returns 1 when the content is considered binary, 0 otherwise.
 */
static int detect_binary_content(VALUE self, VALUE rb_str) {
	size_t buf_len, scan_len;
	const char *buf;

	buf = RSTRING_PTR(rb_str);
	buf_len = RSTRING_LEN(rb_str);
	scan_len = NUM2ULL(rb_iv_get(self, "@binary_scan_length"));

	if (buf_len > 10) {
		// application/postscript (reported as text, not binary)
		if (!memcmp(buf, "%!PS-Adobe-", 11))
			return 0;
	}

	if (buf_len > 7) {
		// image/png
		if (!memcmp(buf, "\x89PNG\x0D\x0A\x1A\x0A", 8))
			return 1;
	}

	if (buf_len > 5) {
		// image/gif
		if (!memcmp(buf, "GIF87a", 6))
			return 1;

		// image/gif
		if (!memcmp(buf, "GIF89a", 6))
			return 1;
	}

	if (buf_len > 4) {
		// application/pdf
		if (!memcmp(buf, "%PDF-", 5))
			return 1;
	}

	/*
	 * The byte-order-mark checks below return "not binary" before the NUL
	 * scan at the end is reached; UTF-16/UTF-32 text presumably contains
	 * NUL bytes and would otherwise be flagged by that scan.
	 */
	if (buf_len > 3) {
		// UTF-32BE
		if (!memcmp(buf, "\0\0\xfe\xff", 4))
			return 0;

		// UTF-32LE
		if (!memcmp(buf, "\xff\xfe\0\0", 4))
			return 0;
	}

	if (buf_len > 2) {
		// image/jpeg
		if (!memcmp(buf, "\xFF\xD8\xFF", 3))
			return 1;
	}

	if (buf_len > 1) {
		// UTF-16BE
		if (!memcmp(buf, "\xfe\xff", 2))
			return 0;

		// UTF-16LE
		if (!memcmp(buf, "\xff\xfe", 2))
			return 0;
	}

	/*
	 * If we got this far, any NULL bytes within the `scan_len`
	 * range will likely mean the contents are binary.
	 */
	if (scan_len < buf_len)
		buf_len = scan_len;

	return !!memchr(buf, 0, buf_len);
}
/*
......@@ -87,7 +153,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
Data_Get_Struct(self, charlock_detector_t, detector);
// first lets see if this is binary content
if (detect_binary_content(detector, rb_str)) {
if (detect_binary_content(self, rb_str)) {
return rb_encdec_binarymatch();
}
......@@ -138,7 +204,7 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
// first lets see if this is binary content
binary_match = Qnil;
if (detect_binary_content(detector, rb_str)) {
if (detect_binary_content(self, rb_str)) {
binary_match = rb_encdec_binarymatch();
}
......@@ -233,6 +299,12 @@ static VALUE rb_get_supported_encodings(VALUE klass)
rb_encoding_list = rb_ary_new();
enc_count = uenum_count(encoding_list, &status);
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250"));
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252"));
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253"));
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254"));
rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255"));
for(i=0; i < enc_count; i++) {
enc_name = uenum_next(encoding_list, &enc_name_len, &status);
rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
......@@ -254,9 +326,6 @@ static void rb_encdec__free(void *obj)
if (detector->csd)
ucsdet_close(detector->csd);
if (detector->magic)
magic_close(detector->magic);
free(detector);
}
......@@ -274,11 +343,6 @@ static VALUE rb_encdec__alloc(VALUE klass)
rb_raise(rb_eStandardError, "%s", u_errorName(status));
}
detector->magic = magic_open(MAGIC_NO_CHECK_SOFT);
if (detector->magic == NULL) {
rb_raise(rb_eStandardError, "%s", magic_error(detector->magic));
}
return obj;
}
......
......@@ -4,7 +4,7 @@ CWD = File.expand_path(File.dirname(__FILE__))
# Echoes +cmd+ and then runs it through xsystem (presumably mkmf's helper).
#
# cmd - the command String to run
#
# Returns the (truthy) result of xsystem.
# Raises RuntimeError when xsystem reports failure.
def sys(cmd)
  puts " -- #{cmd}"
  ret = xsystem(cmd)
  unless ret
    raise "#{cmd} failed, please report issue on https://github.com/brianmario/charlock_holmes"
  end
  ret
end
......@@ -23,6 +23,7 @@ end
dir_config 'icu'
rubyopt = ENV.delete("RUBYOPT")
# detect homebrew installs
if !have_library 'icui18n'
base = if !`which brew`.empty?
......@@ -45,40 +46,12 @@ unless have_library 'icui18n' and have_header 'unicode/ucnv.h'
exit(1)
end
##
# libmagic dependency
#
src = File.basename('file-5.08.tar.gz')
dir = File.basename(src, '.tar.gz')
Dir.chdir("#{CWD}/src") do
FileUtils.rm_rf(dir) if File.exists?(dir)
sys("tar zxvf #{src}")
Dir.chdir(dir) do
sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic")
sys("patch -p0 < ../file-soft-check.patch")
sys("make -C src install")
sys("make -C magic install")
end
end
FileUtils.cp "#{CWD}/dst/lib/libmagic.a", "#{CWD}/libmagic_ext.a"
$INCFLAGS[0,0] = " -I#{CWD}/dst/include "
$LDFLAGS << " -L#{CWD} "
dir_config 'magic'
unless have_library 'magic_ext' and have_header 'magic.h'
STDERR.puts "\n\n"
STDERR.puts "***************************************************************************************"
STDERR.puts "********* error compiling and linking libmagic. please report issue on github *********"
STDERR.puts "***************************************************************************************"
exit(1)
end
have_library 'z' or abort 'libz missing'
have_library 'icuuc' or abort 'libicuuc missing'
have_library 'icudata' or abort 'libicudata missing'
$CFLAGS << ' -Wall -funroll-loops'
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
ENV['RUBYOPT'] = rubyopt
create_makefile 'charlock_holmes/charlock_holmes'
module CharlockHolmes
class EncodingDetector
# Default length for which to scan content for NULL bytes
DEFAULT_BINARY_SCAN_LEN = 1024*1024
# Length for which to scan content for NULL bytes
attr_accessor :binary_scan_length
alias :strip_tags? :strip_tags
# Creates a detector.
#
# scan_len - length for which content is scanned for NULL bytes when
#            checking for binary data (defaults to DEFAULT_BINARY_SCAN_LEN)
def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN)
@binary_scan_length = scan_len
end
# Attempt to detect the encoding of this string
#
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
# as well as use the default binary scan length
#
# str - a String, what you want to detect the encoding of
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
......@@ -19,6 +30,7 @@ module CharlockHolmes
# a list with all the possible encodings that match it.
#
# NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
# as well as use the default binary scan length
#
# str - a String, what you want to detect the encoding of
# hint_enc - an optional String (like "UTF-8"), the encoding name which will
......@@ -29,5 +41,36 @@ module CharlockHolmes
def self.detect_all(str, hint_enc=nil)
new.detect_all(str, hint_enc)
end
# A mapping table of supported encoding names from EncodingDetector
# which point to the corresponding supported encoding name in Ruby.
# Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "binary"}
#
# Note that encodings that can't be mapped between Charlock and Ruby will resolve
# to the BINARY constant ("binary").
@encoding_table = {}
# Returns the Hash mapping detector encoding names to Ruby encoding names.
def self.encoding_table
@encoding_table
end
BINARY = 'binary'
# Fills @encoding_table by walking every name in supported_encodings and
# asking Ruby for an encoding of the same name. Names Ruby does not know
# (Encoding.find raises ArgumentError) are mapped to BINARY instead.
# The table is built at load time rather than hard-coded so that it follows
# whatever encodings the linked ICU reports.
#
# The return value is not meaningful to callers.
def self.build_encoding_table
  supported_encodings.each do |icu_name|
    ruby_name =
      begin
        ::Encoding.find(icu_name).name
      rescue ArgumentError
        BINARY
      end
    @encoding_table[icu_name] = ruby_name
  end
end
build_encoding_table
end
end
......@@ -19,14 +19,14 @@ class String
detector.detect_all(self, hint_enc)
end
if RUBY_VERSION =~ /1.9/
if method_defined? :force_encoding
# Attempt to detect the encoding of this string, then set the encoding
# to the Ruby-compatible name that was detected, ala `force_encoding`.
#
# The :ruby_encoding entry is used rather than :encoding because the raw
# detector name is not always an encoding Ruby knows, and `force_encoding`
# raises ArgumentError for unknown names.
#
# hint_enc - optional encoding hint, passed through to detect_encoding
#
# Returns: self
def detect_encoding!(hint_enc=nil)
  detected = self.detect_encoding(hint_enc)
  ruby_enc = detected && detected[:ruby_encoding]
  self.force_encoding(ruby_enc) if ruby_enc
  self
end
......
module CharlockHolmes
VERSION = "0.6.9.4"
VERSION = "0.7.3"
end
--- !ruby/object:Gem::Specification
name: charlock_holmes
version: !ruby/object:Gem::Version
version: 0.6.9.4
prerelease:
version: 0.7.3
platform: ruby
authors:
- Brian Lopez
......@@ -10,57 +9,52 @@ authors:
autorequire:
bindir: bin
cert_chain: []
date: 2013-04-03 00:00:00.000000000 Z
date: 2014-06-07 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
name: rake-compiler
requirement: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: 0.7.5
type: :development
prerelease: false
version_requirements: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: 0.7.5
- !ruby/object:Gem::Dependency
name: minitest
requirement: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: '0'
type: :development
prerelease: false
version_requirements: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: '0'
- !ruby/object:Gem::Dependency
name: chardet
requirement: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: '0'
type: :development
prerelease: false
version_requirements: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: '0'
description:
description: charlock_holmes provides binary and text detection as well as text transcoding
using libicu
email: seniorlopez@gmail.com
executables: []
extensions:
......@@ -69,7 +63,6 @@ extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- Gemfile.lock
- MIT-LICENSE
- README.md
- Rakefile
......@@ -81,8 +74,6 @@ files:
- ext/charlock_holmes/encoding_detector.c
- ext/charlock_holmes/ext.c
- ext/charlock_holmes/extconf.rb
- ext/charlock_holmes/src/file-5.08.tar.gz
- ext/charlock_holmes/src/file-soft-check.patch
- ext/charlock_holmes/transliterator.cpp
- lib/charlock_holmes.rb
- lib/charlock_holmes/encoding_detector.rb
......@@ -91,38 +82,51 @@ files:
- test/converter_test.rb
- test/encoding_detector_test.rb
- test/fixtures/AnsiGraph.psm1
- test/fixtures/ISO-2022-KR.txt
- test/fixtures/TwigExtensionsDate.es.yml
- test/fixtures/cl-messagepack.lisp
- test/fixtures/core.rkt
- test/fixtures/foo.pdf
- test/fixtures/hello_world
- test/fixtures/laholator.py
- test/fixtures/octocat.ai
- test/fixtures/octocat.gif
- test/fixtures/octocat.jpg
- test/fixtures/octocat.png
- test/fixtures/octocat.psd
- test/fixtures/repl2.cljs
- test/fixtures/sierpinski.ps
- test/fixtures/utf16be.html
- test/fixtures/utf32be.html
- test/fixtures/utf32le.html
- test/fixtures/utf8.html
- test/fixtures/vimrc
- test/helper.rb
- test/string_methods_test.rb
- test/transliterator_test.rb
homepage: http://github.com/brianmario/charlock_holmes
licenses: []
homepage: https://github.com/brianmario/charlock_holmes
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options:
- --charset=UTF-8
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
none: false
requirements:
- - ! '>='
- - '>='
- !ruby/object:Gem::Version
version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.23
rubygems_version: 2.0.3
signing_key:
specification_version: 3
specification_version: 4
summary: Character encoding detection, brought to you by ICU
test_files: []
# encoding: utf-8
require File.expand_path("../helper", __FILE__)
class ConverterTest < MiniTest::Unit::TestCase
class ConverterTest < MiniTest::Test
def test_convert_ascii_from_iso859_1_to_utf16_and_back
input = 'test'
......
# encoding: utf-8
require File.expand_path("../helper", __FILE__)
class EncodingDetectorTest < MiniTest::Unit::TestCase
class EncodingDetectorTest < MiniTest::Test
def setup
@detector = CharlockHolmes::EncodingDetector.new
end
......@@ -87,24 +87,50 @@ class EncodingDetectorTest < MiniTest::Unit::TestCase
assert supported_encodings.is_a?(Array)
assert supported_encodings.include? 'UTF-8'
assert supported_encodings.include? 'windows-1250'
assert supported_encodings.include? 'windows-1252'
assert supported_encodings.include? 'windows-1253'
assert supported_encodings.include? 'windows-1254'
assert supported_encodings.include? 'windows-1255'
end
# Checks that a detection result carries both the detector's own name
# (:encoding) and a Ruby-usable name (:ruby_encoding).
def test_returns_a_ruby_compatible_encoding_name
detected = @detector.detect 'test'
assert_equal 'ISO-8859-1', detected[:encoding]
assert_equal 'ISO-8859-1', detected[:ruby_encoding]
# ISO-2022-KR is expected to be detected but to map to 'binary' on the Ruby side
not_compat_txt = fixture("ISO-2022-KR.txt").read
detected = @detector.detect not_compat_txt
assert_equal 'ISO-2022-KR', detected[:encoding]
assert_equal 'binary', detected[:ruby_encoding]
end
MAPPING = [
['repl2.cljs', 'ISO-8859-1', :text],
['core.rkt', 'UTF-8', :text],
['cl-messagepack.lisp', 'ISO-8859-1', :text],
['sierpinski.ps', 'ISO-8859-1', :text],
['core.rkt', 'UTF-8', :text],
['TwigExtensionsDate.es.yml', 'UTF-8', :text],
['AnsiGraph.psm1', 'UTF-16LE', :text],
['laholator.py', 'UTF-8', :text],
['hello_world', nil, :binary]
['vimrc', 'UTF-8', :text],
['AnsiGraph.psm1', 'UTF-16LE', :text],
['utf16be.html', 'UTF-16BE', :text],
['utf32le.html', 'UTF-32LE', :text],
['utf32be.html', 'UTF-32BE', :text],
['hello_world', nil, :binary],
['octocat.png', nil, :binary],
['octocat.jpg', nil, :binary],
['octocat.psd', nil, :binary],
['octocat.gif', nil, :binary],
['octocat.ai', nil, :binary],
['foo.pdf', nil, :binary],
]
def test_detection_works_as_expected
MAPPING.each do |mapping|
file, encoding, type = mapping
path = File.expand_path "../fixtures/#{file}", __FILE__
content = File.read path
content = fixture(file).read
guessed = @detector.detect content
assert_equal encoding, guessed[:encoding]
......
$)C#
# Out-AnsiGraph.psm1
# Author: xcud
# History:
# v0.1 September 21, 2009 initial version
#
# PS Example> ps | select -first 5 | sort -property VM |
# Out-AnsiGraph ProcessName, VM
# AEADISRV  14508032
# audiodg  50757632
# conhost  73740288
# AppleMobileDeviceService  92061696
# btdna  126443520
#
function Out-AnsiGraph($Parameter1=$null) {
BEGIN {
$q = new-object Collections.queue
$max = 0; $namewidth = 0;
}
PROCESS {
if($_) {
$name = $_.($Parameter1[0]);
$val = $_.($Parameter1[1])
if($max -lt $val) { $max = $val}
if($namewidth -lt $name.length) {
$namewidth = $name.length }
$q.enqueue(@($name, $val))
}
}
END {
$q | %{
$graph = ""; 0..($_[1]/$max*20) |
%{ $graph += "" }
$name = "{0,$namewidth}" -f $_[0]
"$name $graph " + $_[1]
}
}
}
Export-ModuleMember Out-AnsiGraph
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @author: starenka
# @email: 'moc]tod[liamg].T.E[0aknerats'[::-1]
import warnings, hashlib, simplejson, string
from os.path import dirname, abspath
from flask import Flask, render_template, request
from flaskext.sqlalchemy import SQLAlchemy
try:
from sqlalchemy.exceptions import IntegrityError
except ImportError:
from sqlalchemy.exc import IntegrityError
#Hey monkey patcher! NLTK's NgramModel is not serializable w/ pickle.HIGHEST_PROTOCOL (2)
from werkzeug.contrib import cache
cache.HIGHEST_PROTOCOL = 1
from werkzeug.contrib.cache import SimpleCache
from BeautifulSoup import BeautifulSoup
import nltk
PUNCT = list(unicode(string.punctuation))
app = Flask(__name__)
app.config.from_object('settings')
cache = SimpleCache()
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///%s/db.sqlite3'%abspath(dirname(__file__))
db = SQLAlchemy(app)
class Sample(db.Model):
id = db.Column(db.Integer, primary_key=True)
url = db.Column(db.String(80), unique=True)
text = db.Column(db.String())
enabled = db.Column(db.Boolean())
def __unicode__(self):
str = unicode(BeautifulSoup(self.text,convertEntities=BeautifulSoup.HTML_ENTITIES))
return nltk.clean_html(str)
@classmethod
def get_all(self):
cached = cache.get('samples')
if cached is None:
cached = self.query.filter_by(enabled=True).all()
cache.set('samples', cached, timeout=app.config['CACHE_MINUTES'] * 60)
return cached
class Output(db.Model):
id = db.Column(db.Integer, primary_key=True)
hash = db.Column(db.String(128),unique=True)
text = db.Column(db.String())
params = db.Column(db.String(100))
def __init__(self,text,**params):
self.hash = hashlib.sha512(text.encode('utf8')).hexdigest()
self.text = text
self.params = simplejson.dumps(params)
@app.context_processor
def base_context():
return dict(settings=app.config,
hits = Output.query.count() + app.config['INIT_HITS']
)
@app.errorhandler(404)
def page_not_found(error):
return render_template('404.html',title=u"To tady nemáme!"), 404
@app.route('/faq')
def faq():
return render_template('faq.html',title=u"Často kladené dotazy",samples=Sample.get_all())
@app.route('/permalink/<hash>')
def permalink(hash):