diff --git a/.simplecov b/.simplecov index 5c83d9307..c6af845a4 100644 --- a/.simplecov +++ b/.simplecov @@ -1,5 +1,4 @@ require "base64" -require "simplecov-inline-html" require "term/ansicolor" SimpleCov.start do @@ -57,89 +56,95 @@ class SimpleCov::Formatter::SummaryFormatter end end -class SimpleCov::Formatter::CustomHtmlFormatter < SimpleCov::Formatter::InlineHTMLFormatter - def output_message(result) - end +begin + require "simplecov-inline-html" + + class SimpleCov::Formatter::CustomHtmlFormatter < SimpleCov::Formatter::InlineHTMLFormatter + def output_message(result) + end - # This image is missing from simplecov-inline-html - BORDER = <<-EOF.gsub(/\s/, '') - iVBORw0KGgoAAAANSUhEUgAAAAEAAABLCAIAAAA+tHrUAAAAUklEQVR4AcXH - sQ3AIAwFUfuDYADSMQkrMiVdqKGzLDnCRTaIUjzpjnvvodaKtRbM7KWqTkT4 - PBF9Yu+NMQauUvx/Eu45Q0oJOWd3OsbomBmttQc0NiOlCz/4pgAAAABJRU5E - rkJggg== - EOF + # This image is missing from simplecov-inline-html + BORDER = <<-EOF.gsub(/\s/, '') + iVBORw0KGgoAAAANSUhEUgAAAAEAAABLCAIAAAA+tHrUAAAAUklEQVR4AcXH + sQ3AIAwFUfuDYADSMQkrMiVdqKGzLDnCRTaIUjzpjnvvodaKtRbM7KWqTkT4 + PBF9Yu+NMQauUvx/Eu45Q0oJOWd3OsbomBmttQc0NiOlCz/4pgAAAABJRU5E + rkJggg== + EOF - # This image is missing from simplecov-inline-html - CONTROLS = <<-EOF.gsub(/\s/, '') - iVBORw0KGgoAAAANSUhEUgAAAOEAAABLCAMAAACx6hDAAAABj1BMVEVPT0/e - 3t7b29vS0tK7urq5uLjq6uqZmZmSkpJaWlrU1NTj4+PFxcWvr6+goKBbW1u3 - t7c9PT27u7vCwsKsrKxiYWGqqqq5ublbWlpeXV2Xl5fExMSbmpq6ubmNjY18 - fHzy8vIrKystLS0sLCxNTU0uLi4wMDDNzc05OTns6+vl5eUvLy/q6ekqKipM - TExDQ0M4ODgyMjI2NjbZ2dk6OjrY2NjMzMxLS0vAwMBCQkLo5+dHR0cxMTFK - SkpBQUHv7u43NzdISEhFRUVRUVHx8fE7Ozs8PDwzMzNJSUnp6elGRkZQUFDr - 6upeXl7t7e1gYGCoqKjv7+81NTWKiorn5uZERESCgoJdXV3p6OhOTk51dXVA - QEA+Pj6np6fu7e2+vr5cXFxSUlKJiYnOzs7s7OxTU1P29vbw8PB2dnZfX1/m - 5eV4eHifn59qamqmpqbQ0NCOjo7Kysqzs7P4+PiDg4Otra3z8/M/Pz80NDSr - q6u/v7/Pz890dHRpaWmBgYH5+fn08/NoaGjPzs7///+ioqIRuwm9AAAGF0lE - QVR4AcTWd1PDNhQAcH8UDcszdvaOs0lCSNiQsCmjhe7d654fvE+y8QjGXAq+ - 
vj+iu+enO/0iPdnSwdrx18f/YxzsfDDYkqrNtvp3Ynw1LrWbVWlrcCMdfLaz - XkSFh/rms1Vs6nFry+jBnEy0/DAWs414dPzxvwphynoRES6HBlpdWQOVh42Y - 9RpetjEsou3QfwQPlnH/yO9XPD78wxs77yMcxMdLQh19vo9Wl7dE+zWUWQUa - UqECRA6sFE5DRCj/Gs2eC/XvMaUUX33jjejNQi7Zeilc56rwEh1bVozQsr7k - hCgQ4y4QObCLLSCGy39DlzFCSgihIBTjm4VcJyWFYEaEjT1jQSmOEWJMfzX2 - Qt01AyClRFOHmaGqEZj1C8qEyxfGXuOZkDDGCAjF+Eah8J3Iss2jGg1bhCwD - EoiBENaaI4zQGCGF/KAS6i69XMCQZCUVqSUGk3ChPIuU9yvDzVSF3Gc7k14z - Pnq9iWPLnOgL79AuYaZpMiRi2zu2IgjP5y5QcHvC0aRQrYx2Rgo8o3Bggz9E - lNMLdJeekANtpzc9ytbbsVHPHk17VSD6wo7xAEBFUcz7+XxxfWpkxGH883ox - n9+LPCPHQXdB9+WIqbR4KCbJQUcGQq/8AXVWhaYnNN9BKDvN75Jm/Fjvu0QQ - igt/wIHugvmp+0nni6pZVKRFnpFQdwFRM1t5CJighYAgfCofGB810hGKLez9 - kzzl03Z/YnvCJarxQ6fwFbtEXHOFWKS9vDiMhz5R3Wjlx+NxvrWhAjAs9Mtr - 8CAQMk/I3kMoO9MvkqfcnmebjuwK9aKFCQMgrDcfFRIX7uYJtoJezKBHT/gJ - 3KNhYai8rKckhC3M/pw8RS2d9R1Z8vcQKCCJ38M8j5g9VMQDZXUPRdbfw3RO - 6Um1Wb99RZg7P5rYILx56kO/33gf1qJ9KODRPlRL/k2jqdE+9MoHxdT6UDqp - TtuvCCuFUZY3ohCKuxSIobt0FrlLW/CAfBu5SzWgu2EyLXqXivIU71J+0Tj9 - s9eE+6N6z+ZTEt6HHSRC5LXdyPswRxikNx7hBzY9F34finKym9770BX+sKaQ - f9NoLOGbRg19pOjFLiYQmopUjUDgbnHlm+bf9u33tWkgjAP4VvUak251/v79 - RCashUBhTGQV1zBtVLoX3dy0s0OmDETfGmhfCv3HzZPlSp7suKPFcL153zfZ - 2BH2IT/unnAPDi9ZuKEQjrtcqF6XrgvXpevJlQuSq4mlwhc/yA8Xr0sTOQrx - qEU4W23Ry9cWzwq1xSd1baFHSOtDWcGH5VM4rQ/DeepDDULMh7lq/Bfz1vga - hPq/05Qr1BArtML/UGjfNIO3TDBbeMLZgk1nC4/OFkw8W0yWMZM/2bGjacav - xYIZP964OOMzJ65kM34l7iNRMhzTnABm+Xd21LRqWwEQCAFuIIECAZCYAgGQ - mBu+4n9dRGF7n1UBxEJYZWTljcCUGKRAJAb54VW23144If9fxUJwKvnqycOh - SPQr2U/eHhn+EYeXLHzkS3OrIDzza5BG8EX43FDLV8AcVoWMmquAIU3NPytZ - ePuKNHepsMP6cB6nmqTPgvRmXMVfHMCQp6vNiVNguyiEvt8pV3j1lzTXckJ8 - 4TtA4qZfolwgwaeLEimQCsHxfrQXRDjw3RgUQk45pEQKpEKIXf4H3cKmB6AQ - 8uS/COMzSN+jRIjxmpf+Gh5e7ueQ6X0OFe/SPfPfpf9oPozp7UvnQ73C2dc0 - g9nXNFqF5q9L1UJ9tYUGoaI+dP1Af304u1B/ja9DqOs7TflCDTFCaIVWaIVW - aIVWaIVWaIVWaIVDlfAgMlTI95eqhPc2oxYKvxkp7OEO2pvyUz8JcY+wsULc - 5+3IT/20MeL7vE0ULj1/vzt8LDvzwzDaxr36xgqTreytk607Pz+f1u9fL6Re - 
P31wEkZD7LcwVJjepq9ftUZRY3P87mLGB2EjGh4ll9Bg4RISj3ZGa93uVjHd - brQ22uFAI4WK1i7s69rFxq7j3htzhfL2vKQ37yU25+ElNFJIWiwxgv7KY95g - aaaQG3vyHlkEmitEI0bV52yikOe7KnygYcK/0sK5wtrjpuMAAAAASUVORK5C - YII= - EOF + # This image is missing from simplecov-inline-html + CONTROLS = <<-EOF.gsub(/\s/, '') + iVBORw0KGgoAAAANSUhEUgAAAOEAAABLCAMAAACx6hDAAAABj1BMVEVPT0/e + 3t7b29vS0tK7urq5uLjq6uqZmZmSkpJaWlrU1NTj4+PFxcWvr6+goKBbW1u3 + t7c9PT27u7vCwsKsrKxiYWGqqqq5ublbWlpeXV2Xl5fExMSbmpq6ubmNjY18 + fHzy8vIrKystLS0sLCxNTU0uLi4wMDDNzc05OTns6+vl5eUvLy/q6ekqKipM + TExDQ0M4ODgyMjI2NjbZ2dk6OjrY2NjMzMxLS0vAwMBCQkLo5+dHR0cxMTFK + SkpBQUHv7u43NzdISEhFRUVRUVHx8fE7Ozs8PDwzMzNJSUnp6elGRkZQUFDr + 6upeXl7t7e1gYGCoqKjv7+81NTWKiorn5uZERESCgoJdXV3p6OhOTk51dXVA + QEA+Pj6np6fu7e2+vr5cXFxSUlKJiYnOzs7s7OxTU1P29vbw8PB2dnZfX1/m + 5eV4eHifn59qamqmpqbQ0NCOjo7Kysqzs7P4+PiDg4Otra3z8/M/Pz80NDSr + q6u/v7/Pz890dHRpaWmBgYH5+fn08/NoaGjPzs7///+ioqIRuwm9AAAGF0lE + QVR4AcTWd1PDNhQAcH8UDcszdvaOs0lCSNiQsCmjhe7d654fvE+y8QjGXAq+ + vj+iu+enO/0iPdnSwdrx18f/YxzsfDDYkqrNtvp3Ynw1LrWbVWlrcCMdfLaz + XkSFh/rms1Vs6nFry+jBnEy0/DAWs414dPzxvwphynoRES6HBlpdWQOVh42Y + 9RpetjEsou3QfwQPlnH/yO9XPD78wxs77yMcxMdLQh19vo9Wl7dE+zWUWQUa + UqECRA6sFE5DRCj/Gs2eC/XvMaUUX33jjejNQi7Zeilc56rwEh1bVozQsr7k + hCgQ4y4QObCLLSCGy39DlzFCSgihIBTjm4VcJyWFYEaEjT1jQSmOEWJMfzX2 + Qt01AyClRFOHmaGqEZj1C8qEyxfGXuOZkDDGCAjF+Eah8J3Iss2jGg1bhCwD + EoiBENaaI4zQGCGF/KAS6i69XMCQZCUVqSUGk3ChPIuU9yvDzVSF3Gc7k14z + Pnq9iWPLnOgL79AuYaZpMiRi2zu2IgjP5y5QcHvC0aRQrYx2Rgo8o3Bggz9E + lNMLdJeekANtpzc9ytbbsVHPHk17VSD6wo7xAEBFUcz7+XxxfWpkxGH883ox + n9+LPCPHQXdB9+WIqbR4KCbJQUcGQq/8AXVWhaYnNN9BKDvN75Jm/Fjvu0QQ + igt/wIHugvmp+0nni6pZVKRFnpFQdwFRM1t5CJighYAgfCofGB810hGKLez9 + kzzl03Z/YnvCJarxQ6fwFbtEXHOFWKS9vDiMhz5R3Wjlx+NxvrWhAjAs9Mtr + 8CAQMk/I3kMoO9MvkqfcnmebjuwK9aKFCQMgrDcfFRIX7uYJtoJezKBHT/gJ + 3KNhYai8rKckhC3M/pw8RS2d9R1Z8vcQKCCJ38M8j5g9VMQDZXUPRdbfw3RO + 6Um1Wb99RZg7P5rYILx56kO/33gf1qJ9KODRPlRL/k2jqdE+9MoHxdT6UDqp + TtuvCCuFUZY3ohCKuxSIobt0FrlLW/CAfBu5SzWgu2EyLXqXivIU71J+0Tj9 + 
s9eE+6N6z+ZTEt6HHSRC5LXdyPswRxikNx7hBzY9F34finKym9770BX+sKaQ + f9NoLOGbRg19pOjFLiYQmopUjUDgbnHlm+bf9u33tWkgjAP4VvUak251/v79 + RCashUBhTGQV1zBtVLoX3dy0s0OmDETfGmhfCv3HzZPlSp7suKPFcL153zfZ + 2BH2IT/unnAPDi9ZuKEQjrtcqF6XrgvXpevJlQuSq4mlwhc/yA8Xr0sTOQrx + qEU4W23Ry9cWzwq1xSd1baFHSOtDWcGH5VM4rQ/DeepDDULMh7lq/Bfz1vga + hPq/05Qr1BArtML/UGjfNIO3TDBbeMLZgk1nC4/OFkw8W0yWMZM/2bGjacav + xYIZP964OOMzJ65kM34l7iNRMhzTnABm+Xd21LRqWwEQCAFuIIECAZCYAgGQ + mBu+4n9dRGF7n1UBxEJYZWTljcCUGKRAJAb54VW23144If9fxUJwKvnqycOh + SPQr2U/eHhn+EYeXLHzkS3OrIDzza5BG8EX43FDLV8AcVoWMmquAIU3NPytZ + ePuKNHepsMP6cB6nmqTPgvRmXMVfHMCQp6vNiVNguyiEvt8pV3j1lzTXckJ8 + 4TtA4qZfolwgwaeLEimQCsHxfrQXRDjw3RgUQk45pEQKpEKIXf4H3cKmB6AQ + 8uS/COMzSN+jRIjxmpf+Gh5e7ueQ6X0OFe/SPfPfpf9oPozp7UvnQ73C2dc0 + g9nXNFqF5q9L1UJ9tYUGoaI+dP1Af304u1B/ja9DqOs7TflCDTFCaIVWaIVW + aIVWaIVWaIVWaIVDlfAgMlTI95eqhPc2oxYKvxkp7OEO2pvyUz8JcY+wsULc + 5+3IT/20MeL7vE0ULj1/vzt8LDvzwzDaxr36xgqTreytk607Pz+f1u9fL6Re + P31wEkZD7LcwVJjepq9ftUZRY3P87mLGB2EjGh4ll9Bg4RISj3ZGa93uVjHd + brQ22uFAI4WK1i7s69rFxq7j3htzhfL2vKQ37yU25+ElNFJIWiwxgv7KY95g + aaaQG3vyHlkEmitEI0bV52yikOe7KnygYcK/0sK5wtrjpuMAAAAASUVORK5C + YII= + EOF - def file(path) - if path.end_with?("application.css") - css = super(path) - css.gsub!("url(colorbox/border.png)", "url(data:image/png;base64,#{BORDER})") - css.gsub!("url(colorbox/controls.png)", "url(data:image/png;base64,#{CONTROLS})") + def file(path) + if path.end_with?("application.css") + css = super(path) + css.gsub!("url(colorbox/border.png)", "url(data:image/png;base64,#{BORDER})") + css.gsub!("url(colorbox/controls.png)", "url(data:image/png;base64,#{CONTROLS})") - # Fit more stuff without scroll bars, don't linewrap filenames - css << <<-EOF - body { padding:0 !important; } - .src_link { white-space:nowrap; } - .file_list { font-size: .8rem; } - .ui-icon { width:0px !important; height:0px !important; } - th { font-size:.7rem; overflow:hidden; white-space:normal !important; } - EOF - else - super(path) + # Fit more stuff without 
scroll bars, don't linewrap filenames + css << <<-EOF + body { padding:0 !important; } + .src_link { white-space:nowrap; } + .file_list { font-size: .8rem; } + .ui-icon { width:0px !important; height:0px !important; } + th { font-size:.7rem; overflow:hidden; white-space:normal !important; } + EOF + else + super(path) + end end end -end -SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new([ - SimpleCov::Formatter::SummaryFormatter, - SimpleCov::Formatter::CustomHtmlFormatter -]) + SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new([ + SimpleCov::Formatter::SummaryFormatter, + SimpleCov::Formatter::CustomHtmlFormatter + ]) +rescue LoadError + SimpleCov.formatter = SimpleCov::Formatter::SummaryFormatter +end diff --git a/.travis.yml b/.travis.yml index 0e8dbcb86..46178b180 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,13 +9,14 @@ before_install: else gem update --system; gem install bundler; + bundle install; bundle update --bundler; fi os: - osx - linux - - windows + # windows rvm: # 2.0.0 diff --git a/Gemfile b/Gemfile index 34170ff1a..1cbbcb9a6 100644 --- a/Gemfile +++ b/Gemfile @@ -8,4 +8,14 @@ gemspec(path: "build/gemspec", name: "stupidedi-exts") group :development do gem "yard", "= 0.9.16" gem "rdiscount", "~> 2.2" + + gem "irb"# "~> 1.0" + gem "rake", "~> 12.3" + gem "rspec", "~> 3.8" + gem "rspec-collection_matchers"# " " + gem "stackprof", "~> 0.2" + gem "benchmark-ips"# "" + gem "simplecov"# "" + gem "simplecov-inline-html" if RUBY_VERSION >= "2.4" + gem "memory_profiler"# if RUBY_VERSION >= "2.4" end diff --git a/build/doc/lib/rspec-plugin/Rakefile b/build/doc/lib/rspec-plugin/Rakefile index 15b6054b8..1881c6f76 100644 --- a/build/doc/lib/rspec-plugin/Rakefile +++ b/build/doc/lib/rspec-plugin/Rakefile @@ -17,7 +17,7 @@ Rake::GemPackageTask.new(SPEC) do |pkg| end desc "Install the gem locally" -task :install => :package do +task :install => :package do sh "#{SUDO} gem install pkg/#{SPEC.name}-#{SPEC.version}.gem --local --no-rdoc 
--no-ri" sh "rm -rf pkg/yard-#{SPEC.version}" unless ENV['KEEP_FILES'] end diff --git a/build/doc/lib/rspec-plugin/example/example_code.rb b/build/doc/lib/rspec-plugin/example/example_code.rb index ae9efd69a..19f3afc47 100644 --- a/build/doc/lib/rspec-plugin/example/example_code.rb +++ b/build/doc/lib/rspec-plugin/example/example_code.rb @@ -8,9 +8,9 @@ def pig_latin end end -# +# # Specs -# +# describe String do describe '#pig_latin' do it "should be a pig!" do @@ -21,4 +21,4 @@ def pig_latin "hello".pig_latin.should == "hello" end end -end \ No newline at end of file +end diff --git a/build/doc/lib/rspec-plugin/lib/yard-rspec/handler.rb b/build/doc/lib/rspec-plugin/lib/yard-rspec/handler.rb index 848a02e3c..d1826fca7 100644 --- a/build/doc/lib/rspec-plugin/lib/yard-rspec/handler.rb +++ b/build/doc/lib/rspec-plugin/lib/yard-rspec/handler.rb @@ -1,6 +1,6 @@ class RSpecDescribeHandler < YARD::Handlers::Ruby::Base handles method_call(:describe) - + def process return unless statement.last.last @@ -51,7 +51,7 @@ class RSpecItHandler < YARD::Handlers::Ruby::Base handles method_call(:it) handles method_call(:its) handles method_call(:specify) - + def process return unless owner.is_a?(Hash) return unless owner[:describes] diff --git a/build/doc/lib/rspec-plugin/lib/yard-rspec/legacy.rb b/build/doc/lib/rspec-plugin/lib/yard-rspec/legacy.rb index 477075c23..1833d8496 100644 --- a/build/doc/lib/rspec-plugin/lib/yard-rspec/legacy.rb +++ b/build/doc/lib/rspec-plugin/lib/yard-rspec/legacy.rb @@ -2,7 +2,7 @@ class LegacyRSpecDescribeHandler < YARD::Handlers::Ruby::Legacy::Base # @todo deal with rspec metadata hash params MATCH = /\Adescribe\s+(.+?)\s+(?:do|\{)/ handles MATCH - + def process describes = statement.tokens.to_s[MATCH, 1].gsub(/["']/, '') @@ -42,7 +42,7 @@ class LegacyRSpecItHandler < YARD::Handlers::Ruby::Legacy::Base handles MATCH handles /\A(?:its?|specify)\s+(?:do|\{)/ - + def process return unless owner.is_a?(Hash) return unless owner[:describes] diff --git 
a/build/gemspec/stupidedi-core.gemspec b/build/gemspec/stupidedi-core.gemspec index d2266da1a..89b6b175e 100644 --- a/build/gemspec/stupidedi-core.gemspec +++ b/build/gemspec/stupidedi-core.gemspec @@ -34,15 +34,4 @@ Gem::Specification.new do |s| s.required_ruby_version = ">= 2.0.0" s.required_rubygems_version = ">= 2.5.0" s.requirements # << "" - - # Development dependencies - s.add_development_dependency "irb"# "~> 1.0" - s.add_development_dependency "rake", "~> 12.3" - s.add_development_dependency "rspec", "~> 3.8" - s.add_development_dependency "rspec-collection_matchers"# " " - s.add_development_dependency "simplecov"# "" - s.add_development_dependency "simplecov-inline-html"# "" # requires ruby 2.4+ - s.add_development_dependency "stackprof", "~> 0.2" - s.add_development_dependency "benchmark-ips"# "" - s.add_development_dependency "memory_profiler"# "" # requires ruby 2.3+ end diff --git a/ext/c/stupidedi/reader/extconf.rb b/ext/c/stupidedi/reader/extconf.rb index 55efc7543..012598cae 100644 --- a/ext/c/stupidedi/reader/extconf.rb +++ b/ext/c/stupidedi/reader/extconf.rb @@ -1,7 +1,8 @@ require "mkmf" -extension_name = "stupidedi/reader/native_ext" +$warnflags.gsub!(/-Wdeclaration-after-statement/, "") +$CFLAGS << " -std=c99" +extension_name = "stupidedi/reader/native_ext" dir_config extension_name create_makefile extension_name -create_header diff --git a/ext/c/stupidedi/reader/reader_ext.c b/ext/c/stupidedi/reader/native_ext.c similarity index 69% rename from ext/c/stupidedi/reader/reader_ext.c rename to ext/c/stupidedi/reader/native_ext.c index e3fe46bfa..1288d3f26 100644 --- a/ext/c/stupidedi/reader/reader_ext.c +++ b/ext/c/stupidedi/reader/native_ext.c @@ -1,9 +1,15 @@ -#include "extconf.h" #include "ruby.h" #include "ruby/encoding.h" #include "codepoints.h" #include +/* Note: + * + * documentation and specifications: + * https://github.com/ruby/ruby/blob/master/spec/ruby/optional/capi/string_spec.rb + * 
https://github.com/ruby/ruby/blob/master/spec/ruby/optional/capi/encoding_spec.rb + */ + extern VALUE rb_str_length(VALUE str); #define LIKELY(x) (x) @@ -18,6 +24,7 @@ extern VALUE rb_str_length(VALUE str); #endif #endif +/* static int ENCIDX_CP850 = -1; static int ENCIDX_CP852 = -1; static int ENCIDX_CP855 = -1; @@ -36,13 +43,10 @@ static int ENCIDX_IBM864 = -1; static int ENCIDX_IBM865 = -1; static int ENCIDX_IBM866 = -1; static int ENCIDX_IBM869 = -1; +*/ + +static int ENCIDX_US_ASCII = -1; static int ENCIDX_ISO_8859_1 = -1; -static int ENCIDX_ISO_8859_10 = -1; -static int ENCIDX_ISO_8859_11 = -1; -static int ENCIDX_ISO_8859_13 = -1; -static int ENCIDX_ISO_8859_14 = -1; -static int ENCIDX_ISO_8859_15 = -1; -static int ENCIDX_ISO_8859_16 = -1; static int ENCIDX_ISO_8859_2 = -1; static int ENCIDX_ISO_8859_3 = -1; static int ENCIDX_ISO_8859_4 = -1; @@ -51,15 +55,26 @@ static int ENCIDX_ISO_8859_6 = -1; static int ENCIDX_ISO_8859_7 = -1; static int ENCIDX_ISO_8859_8 = -1; static int ENCIDX_ISO_8859_9 = -1; +static int ENCIDX_ISO_8859_10 = -1; +static int ENCIDX_ISO_8859_11 = -1; +static int ENCIDX_ISO_8859_13 = -1; +static int ENCIDX_ISO_8859_14 = -1; +static int ENCIDX_ISO_8859_15 = -1; +static int ENCIDX_ISO_8859_16 = -1; + +/* static int ENCIDX_TIS_620 = -1; -static int ENCIDX_US_ASCII = -1; static int ENCIDX_UTF_16 = -1; static int ENCIDX_UTF_16BE = -1; static int ENCIDX_UTF_16LE = -1; static int ENCIDX_UTF_32 = -1; static int ENCIDX_UTF_32BE = -1; static int ENCIDX_UTF_32LE = -1; +*/ + static int ENCIDX_UTF_8 = -1; + +/* static int ENCIDX_Windows_1250 = -1; static int ENCIDX_Windows_1251 = -1; static int ENCIDX_Windows_1252 = -1; @@ -69,36 +84,60 @@ static int ENCIDX_Windows_1255 = -1; static int ENCIDX_Windows_1256 = -1; static int ENCIDX_Windows_1257 = -1; static int ENCIDX_Windows_1258 = -1; +*/ -static unsigned char encset[32]; +/* This keeps track of which which encidx-es we have seen */ +static unsigned char encdb[32]; static inline bool 
bitmask_test(unsigned char *bitmask, int bitidx, int bitmask_size) { - if (bitidx < 0 || bitidx >= 8*bitmask_size) + int last = (bitmask_size - 1) / 8; + int idx = last - bitidx / 8; + int bit = bitidx % 8; + + if (bitidx < 0 || bitmask_size <= bitidx) return false; - int n = bitmask_size - 1 - bitidx / 8; - int m = bitidx % 8; - char c = *(bitmask + n); - return (c >> m) & 0x1; + return (bitmask[idx] >> bit) & 0x1; } static void bitmask_set(unsigned char *bitmask, int bitidx, int bitmask_size) { - if (bitidx < 0 || bitidx >= 8*bitmask_size) + int last = (bitmask_size - 1) / 8; + int idx = last - bitidx / 8; + int bit = bitidx % 8; + + if (bitidx < 0 || bitmask_size <= bitidx) return; - int n = bitmask_size - 1 - bitidx / 8; - int m = bitidx % 8; - bitmask[n] |= (1 << m); + bitmask[idx] |= (1 << bit); } +/* + * Ideally we could write a switch(encidx) { ... } statement to handle each + * character encoding, but Ruby's public API doesn't export the encidx constants + * for each encoding. + * + * It does let us lookup the name for a given encidx, but testing the encoding + * name (encname == "US-ASCII") against multiple possible matches is expensive. + * Especially when the test is repeated for every input character, to determine + * if it's whitespace or a non-graphical character. + * + * This scheme only requires comparing the encoding name against known encodings + * on the first time we see this particular encidx; from then on, other code + * can just compare encidx with ENCIDX_xx. + * + * NOTE: We could enumerate all encodings in Init_native_ext and assigned each + * constant all at once, but many encodings are marked "autoload" and + * enumerating them that way would eagerly load many encodings that won't + * actually be used. 
+ */ static bool -update_encidx(int encidx) { +update_encdb(int encidx) { const char *encname; /* We've already assigned the encidx to an ENCIDX_xx global */ - if (bitmask_test(encset, encidx, 32)) + if (bitmask_test(encdb, encidx, 256)) return false; /* Otherwise, match the "NAME" to the ENCIDX_NAME constant */ @@ -106,10 +145,13 @@ update_encidx(int encidx) { #define TESTENC(name, id) if (0 == strncmp(name,encname,64)) {\ ENCIDX_##id = encidx;\ - bitmask_set(encset, encidx, 32); \ + bitmask_set(encdb, encidx, 256); \ return true; \ } + TESTENC("US-ASCII", US_ASCII) + + /* TESTENC("CP850", CP850) // US-ASCII+ https://en.wikipedia.org/wiki/Code_page_850 TESTENC("CP852", CP852) // CP850+ https://en.wikipedia.org/wiki/Code_page_852 TESTENC("CP855", CP855) // CP850+ https://en.wikipedia.org/wiki/Code_page_855 @@ -129,6 +171,7 @@ update_encidx(int encidx) { TESTENC("IBM865", IBM865) // IBM437+ https://en.wikipedia.org/wiki/Code_page_865 TESTENC("IBM866", IBM866) // IBM437+ https://en.wikipedia.org/wiki/Code_page_866 TESTENC("IBM869", IBM869) // IBM437+ https://en.wikipedia.org/wiki/Code_page_869 + */ TESTENC("ISO-8859-1", ISO_8859_1) TESTENC("ISO-8859-2", ISO_8859_2) @@ -145,18 +188,23 @@ update_encidx(int encidx) { TESTENC("ISO-8859-14", ISO_8859_14) TESTENC("ISO-8859-15", ISO_8859_15) TESTENC("ISO-8859-16", ISO_8859_16) - TESTENC("TIS-620", TIS_620) // https://en.wikipedia.org/wiki/ISO/IEC_8859-11 - TESTENC("US-ASCII", US_ASCII) + /* + TESTENC("TIS-620", TIS_620) // https://en.wikipedia.org/wiki/ISO/IEC_8859-11 + */ TESTENC("UTF-8", UTF_8) + + /* TESTENC("UTF-16", UTF_16) TESTENC("UTF-16BE", UTF_16BE) TESTENC("UTF-16LE", UTF_16LE) TESTENC("UTF-32", UTF_32) TESTENC("UTF-32BE", UTF_32BE) TESTENC("UTF-32LE", UTF_32LE) + */ + /* TESTENC("Windows-1250", Windows_1250) TESTENC("Windows-1251", Windows_1251) TESTENC("Windows-1252", Windows_1252) @@ -166,6 +214,7 @@ update_encidx(int encidx) { TESTENC("Windows-1256", Windows_1256) TESTENC("Windows-1257", Windows_1257) 
TESTENC("Windows-1258", Windows_1258) + */ return false; } @@ -185,10 +234,10 @@ update_encidx(int encidx) { * This means for around 700 intervals (the number of codepoint ranges that * cover Unicode graphical characters), about 10 iterations are required. * - * TODO: Using an optimal binary tree might reduce the number of iterations, - * but would increase the complexity -- using a contiguous region of memory - * like an array provides good data locality, but some scheme would be needed - * to represent a non-complete binary tree. The best approach might be to + * TODO: Using an optimal binary tree might reduce the average number of + * iterations, but it would increase the complexity -- using a contiguous region + * of memory like an array provides good data locality, but some scheme would be + * needed to represent a non-complete binary tree. The best approach might be to * allocate a contiguous block of memory and then use a linked representation. * But reducing most queries from 10 iterations to 1-2 might not improve much? 
*/ @@ -198,13 +247,12 @@ has_matching_interval(const unsigned int point, const unsigned int *max, const unsigned int size) { - int k, l, r, z; - l = 0; - r = size - 1; - z = -1; + int k, + l = 0, + r = size - 1, + z = -1; - for (l = 0, r = size - 1, z = -1; - k = (l + r) / 2, l <= r;) { + for (l = 0, r = size - 1, z = -1; k = (l + r) / 2, l <= r;) { if (UNLIKELY(point > min[k])) l = (z = k) + 1; // descend right else if (point < min[k]) @@ -222,18 +270,18 @@ has_matching_interval(const unsigned int point, return false; } -// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt - static inline bool is_whitespace(const unsigned int c, const int encidx) { if (encidx == ENCIDX_US_ASCII) - return (c >= 0x08 && c <= 0x0d) || c == 0x20; + return (0x08 <= c && c <= 0x0d) || c == 0x20; + else if (encidx == ENCIDX_UTF_8) return has_matching_interval(c, ucs_codepoints_whitespace_min, ucs_codepoints_whitespace_max, ucs_codepoints_whitespace_count); + else if (encidx == ENCIDX_ISO_8859_1 || encidx == ENCIDX_ISO_8859_2 || encidx == ENCIDX_ISO_8859_3 || @@ -249,11 +297,11 @@ is_whitespace(const unsigned int c, const int encidx) encidx == ENCIDX_ISO_8859_14 || encidx == ENCIDX_ISO_8859_15 || encidx == ENCIDX_ISO_8859_16) - return (c >= 0x08 && c <= 0x0d) || c == 0x20 || c == 0xa0; + return (0x08 <= c && c <= 0x0d) || c == 0x20 || c == 0xa0; /* If nothing matched, it could be the first time we've seen this encoding * and we haven't assigned ENCIDX_XX yet. If so, update and retry */ - if (update_encidx(encidx)) + if (update_encdb(encidx)) return is_whitespace(c, encidx); rb_raise(rb_eEncCompatError, "unsupported encoding: %s", @@ -262,9 +310,6 @@ is_whitespace(const unsigned int c, const int encidx) /* * Letters, punctuation, symbols, ... 
have a visual representation - * - * Not control character (e.g., <= 0x1f in US-ASCII) - * Not undefined character (e.g., > 0x7f in US-ASCII) */ static bool is_graphic(const unsigned int c, const int encidx) @@ -279,7 +324,7 @@ is_graphic(const unsigned int c, const int encidx) ucs_codepoints_graphic_count); if (encidx == ENCIDX_US_ASCII) - return (c >= 0x20 && c <= 0x7f); + return (0x20 <= c && c <= 0x7f); if (encidx == ENCIDX_ISO_8859_1 || encidx == ENCIDX_ISO_8859_2 || @@ -296,44 +341,40 @@ is_graphic(const unsigned int c, const int encidx) encidx == ENCIDX_ISO_8859_14 || encidx == ENCIDX_ISO_8859_15 || encidx == ENCIDX_ISO_8859_16) - if (c >= 0x20 && c <= 0x7f) + if (0x20 <= c && c <= 0x7f) return true; - if (encidx == ENCIDX_ISO_8859_1) return bitmask_test(iso_8859_graphic[0], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_2) return bitmask_test(iso_8859_graphic[1], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_3) return bitmask_test(iso_8859_graphic[2], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_4) return bitmask_test(iso_8859_graphic[3], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_5) return bitmask_test(iso_8859_graphic[4], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_6) return bitmask_test(iso_8859_graphic[5], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_7) return bitmask_test(iso_8859_graphic[6], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_8) return bitmask_test(iso_8859_graphic[7], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_9) return bitmask_test(iso_8859_graphic[8], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_10) return bitmask_test(iso_8859_graphic[9], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_11) return bitmask_test(iso_8859_graphic[10], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_13) return bitmask_test(iso_8859_graphic[12], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_14) return bitmask_test(iso_8859_graphic[13], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_15) return 
bitmask_test(iso_8859_graphic[14], c-0xa0, 12); - else if (encidx == ENCIDX_ISO_8859_16) return bitmask_test(iso_8859_graphic[15], c-0xa0, 12); + if (encidx == ENCIDX_ISO_8859_1) return bitmask_test(iso_8859_graphic[0], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_2) return bitmask_test(iso_8859_graphic[1], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_3) return bitmask_test(iso_8859_graphic[2], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_4) return bitmask_test(iso_8859_graphic[3], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_5) return bitmask_test(iso_8859_graphic[4], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_6) return bitmask_test(iso_8859_graphic[5], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_7) return bitmask_test(iso_8859_graphic[6], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_8) return bitmask_test(iso_8859_graphic[7], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_9) return bitmask_test(iso_8859_graphic[8], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_10) return bitmask_test(iso_8859_graphic[9], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_11) return bitmask_test(iso_8859_graphic[10], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_13) return bitmask_test(iso_8859_graphic[12], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_14) return bitmask_test(iso_8859_graphic[13], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_15) return bitmask_test(iso_8859_graphic[14], c-0xa0, 96); + else if (encidx == ENCIDX_ISO_8859_16) return bitmask_test(iso_8859_graphic[15], c-0xa0, 96); /* If nothing matched, it could be the first time we've seen this encoding * and we haven't assigned ENCIDX_XX yet. 
If so, update and retry */ - if (update_encidx(encidx)) + if (update_encdb(encidx)) return is_graphic(c, encidx); rb_raise(rb_eEncCompatError, "unsupported encoding: %s", rb_enc_name(rb_enc_from_index(encidx))); } +/* True if each character in the string is a single byte */ static inline bool single_byte_optimizable(VALUE str, rb_encoding *enc) { - if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) - return true; - - if (rb_enc_mbmaxlen(enc) == 1) - return true; - - return false; + return ENC_CODERANGE(str) == ENC_CODERANGE_7BIT + || rb_enc_mbmaxlen(enc) == 1; } /* @@ -371,20 +412,21 @@ rb_substr_eq_p(VALUE self, VALUE str1, VALUE _index1, VALUE str2, VALUE _index2, rb_enc_name(rb_enc_get(str1)), rb_enc_name(rb_enc_get(str2))); - if (idx1 + len > rb_str_strlen(str1)) return Qnil; - if (idx2 + len > rb_str_strlen(str2)) return Qnil; + if (rb_str_strlen(str1) < idx1 + len) return Qnil; + if (rb_str_strlen(str2) < idx2 + len) return Qnil; - /* Number of bytes in str[idx, len], calculated by rb_str_subpos */ long len1, len2; len1 = len; len2 = len; + /* Request len _characters_ str[idx,len]. Then len1, len2 will be number of _bytes_ */ const char *ptr1, *ptr2; ptr1 = rb_str_subpos(str1, idx1, &len1); ptr2 = rb_str_subpos(str2, idx2, &len2); - if (ptr1 == NULL || ptr2 == NULL) return Qnil; + /* Number of bytes in str1[idx1,len] isn't equal to str2[idx2,len] */ if (len1 != len2 || len1 < len) return Qnil; + if (ptr1 == NULL || ptr2 == NULL) return Qnil; return (memcmp(ptr1, ptr2, len) == 0) ? Qtrue : Qfalse; } @@ -402,35 +444,33 @@ rb_graphic_p(int argc, const VALUE *argv, VALUE self) { rb_check_arity(argc, 1, 2); - Check_Type(argv[0], T_STRING); + Check_Type(argv[0], T_STRING); if (argc == 2) Check_Type(argv[1], T_FIXNUM); - VALUE str; - int encidx, len_; long len, idx; - char *ptr, *end; - rb_encoding *enc; - unsigned int chr; + len = 1; + idx = argc < 2 ? 
0 : FIX2LONG(argv[1]); - str = argv[0]; - ptr = RSTRING_PTR(str); + VALUE str = argv[0]; + char *ptr, *end; end = RSTRING_END(str); + ptr = rb_str_subpos(str, idx, &len); /* address of str[idx], len is .bytesize */ + + int encidx, len_; encidx = ENCODING_GET(str); - idx = (argc == 1) ? 0 : FIX2LONG(argv[1]); - len = 1; - // Skip ahead to idx - ptr = rb_str_subpos(str, idx, &len); + rb_encoding *enc; + unsigned int chr; - if (!ptr || !len || ptr >= end) + if (ptr == NULL || len == 0) return Qnil; - enc = rb_enc_from_index(encidx); len_ = 1; + enc = rb_enc_from_index(encidx); chr = rb_enc_codepoint_len(ptr, end, &len_, enc); - if (!len_) + if (len_ == 0) return Qnil; return is_graphic(chr, encidx) ? Qtrue : Qfalse; @@ -450,120 +490,130 @@ rb_whitespace_p(int argc, const VALUE *argv, VALUE self) { rb_check_arity(argc, 1, 2); - Check_Type(argv[0], T_STRING); + Check_Type(argv[0], T_STRING); if (argc == 2) Check_Type(argv[1], T_FIXNUM); - VALUE str; - int encidx, len_; long len, idx; - char *ptr, *end; - rb_encoding *enc; - unsigned int chr; + len = 1; + idx = argc < 2 ? 0 : FIX2LONG(argv[1]); - str = argv[0]; - ptr = RSTRING_PTR(str); + VALUE str = argv[0]; + char *ptr, *end; end = RSTRING_END(str); + ptr = rb_str_subpos(str, idx, &len); /* address of str[idx], len is .bytesize */ + + int encidx, len_; encidx = ENCODING_GET(str); - idx = (argc == 1) ? 0 : FIX2LONG(argv[1]); - len = 1; - // Skip ahead to idx - ptr = rb_str_subpos(str, idx, &len); + rb_encoding *enc; + unsigned int chr; - if (!ptr || !len || ptr >= end) + if (ptr == NULL || len == 0) return Qnil; - enc = rb_enc_from_index(encidx); len_ = 1; + enc = rb_enc_from_index(encidx); chr = rb_enc_codepoint_len(ptr, end, &len_, enc); - if (!len_) + if (len_ == 0) return Qnil; - return is_whitespace(chr, encidx) ? Qtrue : Qfalse; + return is_graphic(chr, encidx) ? 
Qtrue : Qfalse; } /* * call-seq: * min_graphic_index(string, index=0) -> int * - * Description + * Returns the smallest index (starting from the given index) that is a graphic + * character. If no graphic characters occur after the given index, then the + * string length is returned. * * min_graphic_index("\r\nabc ") #=> 2 * min_graphic_index("\r\nabc ", 2) #=> 2 * min_graphic_index("\r\nabc ", 5) #=> 5 + * min_graphic_index("\r\n") #=> 2 */ static VALUE -rb_min_graphic_index(int argc, const VALUE *argv, VALUE self) { +rb_min_graphic_index(int argc, const VALUE *argv, VALUE self) +{ rb_check_arity(argc, 1, 2); Check_Type(argv[0], T_STRING); - if (argc == 2) + if (argc >= 2) Check_Type(argv[1], T_FIXNUM); VALUE str; - int encidx; - long idx; - char *ptr, *end; - rb_encoding *enc; - str = argv[0]; + + char *ptr, *end; end = RSTRING_END(str); ptr = RSTRING_PTR(str); - idx = (argc == 1) ? 0 : FIX2LONG(argv[1]); + long idx; + idx = argc < 2 ? 0 : FIX2LONG(argv[1]); + + int encidx; encidx = ENCODING_GET(str); + + rb_encoding *enc; enc = rb_enc_from_index(encidx); - if (idx < 0) rb_raise(rb_eArgError, "index cannot be negative"); - if (!ptr) return INT2FIX(0); + if (idx < 0) rb_raise(rb_eArgError, "index cannot be negative"); + if (ptr == NULL) return INT2FIX(0); if (single_byte_optimizable(str, enc)) { - ptr += idx; + ptr += idx; /* address of str[idx] */ - if (ptr >= end) + if (end <= ptr) return LONG2NUM(RSTRING_LEN(str)); while (ptr < end && !is_graphic(*ptr, encidx)) ptr ++; return LONG2NUM(ptr - RSTRING_PTR(str)); - } + } else { + long len_, count; + len_ = 1; + count = 0; + + /* address of str[idx], len is .bytesize */ + ptr = rb_str_subpos(str, idx, &len_); + if (ptr == NULL) return rb_str_length(str); - int len; - long len_, count; - unsigned int c; + unsigned int c; + int len; - len_ = 1; - count = 0; - ptr = rb_str_subpos(str, idx, &len_); + while (ptr < end) { + c = rb_enc_codepoint_len(ptr, end, &len, enc); - if (!ptr || ptr >= end) - return 
rb_str_length(str); + if (is_graphic(c, encidx)) + break; - while (ptr < end - && (c = rb_enc_codepoint_len(ptr, end, &len, enc)) != 0 - && !is_graphic(c, encidx)) { ptr += len; count ++; - } + } - return LONG2NUM(count); + return LONG2NUM(idx + count); + } } /* * call-seq: * min_nonspace_index(string, index=0) -> int * - * Description + * Returns the smallest index (starting from the given index) that is not + * whitespace. If non-whitespace does not occur before the given index, then + * the length of the string is returned. + * + * s[min_nonspace_index(s)..-1] == s.lstrip + * s[min_nonspace_index(s, n)..-1] == s[n..-1].lstrip * * min_nonspace_index(" abc ") #=> 1 * min_nonspace_index(" abc ", 2) #=> 2 * min_nonspace_index(" abc ", 4) #=> 5 - * - * s[min_nonspace_index(s)..-1] == s.lstrip - * s[min_nonspace_index(s, n)..-1] == s[n..-1].lstrip + * min_nonspace_index("\r\n ") #=> 4 */ static VALUE rb_min_nonspace_index(int argc, const VALUE *argv, VALUE self) @@ -571,81 +621,144 @@ rb_min_nonspace_index(int argc, const VALUE *argv, VALUE self) rb_check_arity(argc, 1, 2); Check_Type(argv[0], T_STRING); - if (argc == 2) + if (argc >= 2) Check_Type(argv[1], T_FIXNUM); VALUE str; - int encidx; - long idx; - char *ptr, *end; - rb_encoding *enc; - str = argv[0]; + + char *ptr, *end; end = RSTRING_END(str); ptr = RSTRING_PTR(str); - idx = (argc == 1) ? 0 : FIX2LONG(argv[1]); + long idx; + idx = argc < 2 ? 
0 : FIX2LONG(argv[1]); + + int encidx; encidx = ENCODING_GET(str); + + rb_encoding *enc; enc = rb_enc_from_index(encidx); - if (idx < 0) rb_raise(rb_eArgError, "index cannot be negative"); - if (!ptr) return INT2FIX(0); + if (idx < 0) rb_raise(rb_eArgError, "index cannot be negative"); + if (ptr == NULL) return INT2FIX(0); if (single_byte_optimizable(str, enc)) { - ptr += idx; + ptr += idx; /* address of str[idx] */ - if (ptr >= end) + if (end <= ptr) return LONG2NUM(RSTRING_LEN(str)); while (ptr < end && is_whitespace(*ptr, encidx)) ptr ++; return LONG2NUM(ptr - RSTRING_PTR(str)); - } + } else { + long len_, count; + len_ = 1; + count = 0; - int len; - long len_, count; - unsigned int c; + /* address of str[idx], len is .bytesize */ + ptr = rb_str_subpos(str, idx, &len_); + if (ptr == NULL) return rb_str_length(str); - len_ = 1; - count = 0; - ptr = rb_str_subpos(str, idx, &len_); + unsigned int c; + int len; - if (!ptr || ptr >= end) - return rb_str_length(str); + while (ptr < end) { + c = rb_enc_codepoint_len(ptr, end, &len, enc); + + if (!is_whitespace(c, encidx)) + break; - while (ptr < end - && (c = rb_enc_codepoint_len(ptr, end, &len, enc)) != 0 - && is_whitespace(c, encidx)) { ptr += len; count ++; - } + } - return LONG2NUM(count); + return LONG2NUM(idx + count); + } } /* * call-seq: * max_nonspace_index(string, index=0) -> int * - * Description + * Returns the largest index (starting from the given index) that is not + * whitespace. If non-whitespace does not occur before the given index, then + * the starting position is returned. 
+ * + * s[0, max_nonspace_index(s)] == s.rstrip + * s[0, max_nonspace_index(s, n)] == s[0..n].rstrip * * max_nonspace_index(" abc ") #=> 3 * max_nonspace_index(" abc ", 2) #=> 2 * max_nonspace_index(" abc ", 0) #=> 0 - * - * s[0, max_nonspace_index(s)] == s.rstrip - * s[0, max_nonspace_index(s, n)] == s[0..n].rstrip + * max_nonspace_index(" abc ", 0) #=> 0 */ static VALUE -rb_max_nonspace_index() { - return Qnil; -} +rb_max_nonspace_index(int argc, VALUE *argv, VALUE self) +{ + rb_check_arity(argc, 1, 2); + + Check_Type(argv[0], T_STRING); + if (argc == 2) + Check_Type(argv[1], T_FIXNUM); + + VALUE str; + str = argv[0]; + + char *start, *end, *ptr; + start = RSTRING_PTR(str); + end = RSTRING_END(str); + + long idx; + idx = argc < 2 ? rb_str_strlen(str) : FIX2LONG(argv[1]); + int encidx; + encidx = ENCODING_GET(str); + + rb_encoding *enc; + enc = rb_enc_from_index(encidx); + + if (idx < 0) rb_raise(rb_eArgError, "index cannot be negative"); + if (start == NULL) return INT2FIX(0); + + if (single_byte_optimizable(str, enc)) { + ptr = start + idx; /* address of str[idx] */ + + if (end <= ptr) + ptr = end - 1; /* start at the last character */ + + while (start <= ptr && is_whitespace(*ptr, encidx)) + ptr --; + + return LONG2NUM(ptr - start); + } else { + long len, count; + len = 1; + count = 0; + + /* address of str[idx], len is .bytesize */ + ptr = rb_str_subpos(str, idx, &len); + if (ptr == NULL) return rb_str_length(str); + + while (ptr != NULL && ptr < end) { + unsigned int c = rb_enc_codepoint(ptr, end, enc); + + if (!is_whitespace(c, encidx)) + break; + + ptr = rb_enc_prev_char(start, ptr, end, enc); + count ++; + } + + return LONG2NUM(idx - count); + } +} void Init_native_ext(void) { for (int n = 0; n < 32; n++) - encset[n] = 0; + encdb[n] = 0; VALUE rb_mStupidedi = rb_define_module("Stupidedi"); VALUE rb_mReader = rb_define_module_under(rb_mStupidedi, "Reader"); diff --git a/lib/stupidedi/reader/input.rb b/lib/stupidedi/reader/input.rb index 9d3eade9d..d76c56722 
100644 --- a/lib/stupidedi/reader/input.rb +++ b/lib/stupidedi/reader/input.rb @@ -58,7 +58,7 @@ def drop!(n) end # Calculates the position at the given offset. - # + # # @return [Position] def position_at(n) if @position.eql?(Position::NoPosition) @@ -125,7 +125,12 @@ def build(value, *args) elsif value.respond_to?(:read) position = args.last.delete(:position) if args.last.is_a?(Hash) path = value.path if value.respond_to?(:path) - new(Pointer.build(value.read), (position || Position::NoPosition).build(path)) + content = value.read + + # This will throw Encoding::InvalidByteSequenceErorr + content.encode("binary") unless content.valid_encoding? + + new(content, (position || Position::NoPosition).build(path)) else raise TypeError, @@ -146,20 +151,28 @@ def file(path, *args) args.last.delete(:position) end || Position::NoPosition - new(Pointer.build(File.read(path, *args)), position.build(path)) + content = File.read(path, *args) + + # This will throw Encoding::InvalidByteSequenceErorr + content.encode("binary") unless content.valid_encoding? + + new(Pointer.build(content), position.build(path)) end # @example # Input.string(io.read) # Input.string("...", position: Position::OffsetPosition) # - def string(value, *args) + def string(content, *args) position = if args.last.is_a?(Hash) args.last.delete(:position) end || Position::NoPosition - new(Pointer.build(value), position.build(nil)) + # This will throw Encoding::InvalidByteSequenceErorr + content.encode("binary") unless content.valid_encoding? + + new(Pointer.build(content), position.build(nil)) end # @endgroup diff --git a/lib/stupidedi/reader/pointer.rb b/lib/stupidedi/reader/pointer.rb index 882b71ef4..b10a6cf0d 100644 --- a/lib/stupidedi/reader/pointer.rb +++ b/lib/stupidedi/reader/pointer.rb @@ -327,6 +327,14 @@ class << Pointer # @group Constructors ######################################################################### + # Constructs a new Pointer depending on what type of object is passed. 
+ # + # NOTE: Pointer operations can potentially destrucively modify the given + # object, but if it is `#frozen?`, a copy will be made before the update. + # If you are accessing or modifying the object outside of the Pointer API + # unexpected results might occur. To avoid this, either provide a copy + # with `#dup` or freeze the object first with `#freeze`. + # def build(object) case object when String diff --git a/lib/stupidedi/reader/string_ptr.rb b/lib/stupidedi/reader/string_ptr.rb index 4e837091b..56c06d56f 100644 --- a/lib/stupidedi/reader/string_ptr.rb +++ b/lib/stupidedi/reader/string_ptr.rb @@ -8,7 +8,7 @@ class StringPtr < Pointer # @group Conversion Methods ######################################################################### - def_delegators :@storage, :encoding, type: String + def_delegators :@storage, :encoding, :valid_encoding?, type: String def_delegators :reify, :to_sym, :intern, :to_i, type: String def_delegators :reify, :to_d @@ -412,35 +412,53 @@ def +(other) # @group Formatting ######################################################################### - # Returns a new StringPtr with trailing spaces removed; "\000", "\t", - # "\n", "\v", "\f", "\r", " " + # Returns a new StringPtr with leading whitespace removed. 
# # @return [StringPtr] - def rstrip(offset = @length - 1) - raise ArgumentError, "offset must be non-negative" if offset < 0 - offset = @length if offset > @length - offset_ = NativeExt.max_nonspace_index(@storage, @offset + offset) + def lstrip(start_at = 0) + raise ArgumentError, "start_at must be non-negative" if start_at < 0 + start_at = @length - 1 if start_at >= @length + index = NativeExt.min_nonspace_index(@storage, @offset + start_at) - length = (offset_ || @offset - 1) - @offset + 1 - take(length) + if index <= @offset + self + else + # 01234o--> + # ----[xxxxx ]-------- + # + # In this picture, min_nonspace_index(o=5) would return an index + # that's past the end of this substring, but `drop` will prevent + # any problem because it does a boundary check. + drop(index - @offset) + end end - # Returns a new StringPtr with leading spaces removed; "\000", "\t", - # "\n", "\v", "\f", "\r", " " + # Returns a new StringPtr with trailing whitespace removed. # # @return [StringPtr] - def lstrip(offset = 0) - raise ArgumentError, "offset must be non-negative" if offset < 0 - offset = @length if offset > @length - offset_ = NativeExt.min_nonspace_index(@storage, @offset + offset) + def rstrip(start_at = @length - 1) + raise ArgumentError, "start_at must be non-negative" if start_at < 0 + start_at = @length - 1 if start_at >= @length + index = NativeExt.max_nonspace_index(@storage, @offset + start_at) - if offset_ > @offset + @length + if index >= @offset + @length - 1 self else - drop(offset_ - @offset) + # <---o3456 + # ----[ xxxx]-------- + # + # In this picture, max_nonspace_index(o=2) would return an index + # that's before the start of this substring, which would result + # in take(n < 0) which throws an exception. 
This prevents that: + index = @offset - 1 if index < @offset + take(index + 1 - @offset) end end + def strip + lstrip.rstrip + end + # @group Miscellaneous ######################################################################### diff --git a/lib/stupidedi/reader/tokenizer.rb b/lib/stupidedi/reader/tokenizer.rb index 009f0ac3d..b14dec25c 100644 --- a/lib/stupidedi/reader/tokenizer.rb +++ b/lib/stupidedi/reader/tokenizer.rb @@ -21,6 +21,9 @@ class Tokenizer # @return [SegmentDict] attr_accessor :segment_dict + VALID_SEPARATOR = /[^[:alnum:] ]/u + VALID_SEGMENT_ID = /\A[A-Z][A-Z0-9]{1,2}\Z/u + # @param input [Input] def initialize(input, separators, segment_dict, switcher, switcher_) # Make a separate switch for _read_component_element so building a @@ -31,12 +34,6 @@ def initialize(input, separators, segment_dict, switcher, switcher_) @i = "I".encode(@input.encoding).freeze @s = "S".encode(@input.encoding).freeze @a = "A".encode(@input.encoding).freeze - - #@bad_separator = /[a-zA-Z0-9 ]/.freeze # TODO: encode - #@segment_id = /\A[A-Z][A-Z0-9]{1,2}\Z/.freeze # TODO: encode - - @bad_separator = /[[:alnum:][:space]]/u - @segment_id = /\A[A-Z][A-Z0-9]{1,2}\Z/u end # @yield [Tokens::SegmentTok | Tokens::IgnoredTok] @@ -180,38 +177,38 @@ def _next_isa_segment_id(input) i = input.index(@i, offset) # There's no I in the rest of the input, so it's all ignored - return eof("ISA", input.position_at(offset)) if i.nil? + #return eof("ISA", input.position_at(offset)) if i.nil? + return eof("ISA", input.position) if i.nil? # In the next iteration, search for "I" begins right after this one offset = i+1 - s = input.index(@s, i+1) - - # There's no S in the rest of the input, so it's all ignored - return eof("ISA", input.position_at(i)) if s.nil? 
+ # Skip to the next character + s = input.min_graphic_index(i+1) - # There's something between I..S but it's not a control character - next if s > i+1 and input[i+1, s-i-1].min_graphic_index > i+1 + #return eof("ISA", input.position_at(i)) unless input.defined_at?(s) + return eof("ISA", input.position) unless input.defined_at?(s) - a = input.index(@a, s+1) + # The character after this "I" is not "S" + next unless input[s, 1] == @s - # There's no A in the rest of the input, so it's all ignored - return eof("ISA", input.position_at(i)) if a.nil? + a = input.min_graphic_index(s+1) - # There's something between S..A but it's not a control character - next if a > s+1 and input[s+1, a-s-1].min_graphic_index > s+1 + #return eof("ISA", input.position_at(i)) unless input.defined_at?(a) + return eof("ISA", input.position) unless input.defined_at?(a) - # Needed to perform the extra validation below - a = input.min_graphic_index(a) + # The character after this "S" is not "A" + next unless input[a, 1] == @a # The next character determines the element separator. If it's an # alphanumeric or space, we assume this is not the start of an ISA # segment. Perhaps a word like "L[ISA] " or "D[ISA]RRAY" - next if not input.defined_at?(a+1) or input.at(a+1).match?(@bad_separator) + next unless input.defined_at?(a+1) and input[a+1].match?(VALID_SEPARATOR) # Success, ignore everything before "I", resume parsing after "A". yield Tokens::IgnoredTok.new(input.take(i), input.position) if block_given? + # First character of input will be the element separator return done(:ISA, input.position_at(i), input.drop!(a+1)) end @@ -235,21 +232,22 @@ def _read_isa_elements(input) element.value end - # We have to assume the last (16th) element is fixed-length because - # it is not terminated by an element separator. First we will skip - # past control characters, then read the next character. 
- offset = input.min_graphic_index(1) - return eof("ISA16", input.position) unless input.defined_at?(offset) - element_toks << Tokens::SimpleElementTok.build(input.at(offset), input.position_at(offset)) - - # The character immediately after ISA16 is defined to be the - # segment terminator. The separator could be a control character, - # e.g. \n, because we do not skip past them here. - return eof("segment terminator for ISA", input.position_at(offset)) \ - unless input.defined_at?(offset + 1) - - @separators.segment = input.at(offset + 1) - done(element_toks, nil, input.drop!(offset + 2)) + # The next two characters define ISA16 (this is the component separator, + # at least as early as version 00304) and the segment terminator. Both + # can have control characters as values, so we don't skip ahead here. + us = input.at(1) + return eof("ISA16", input.position_at(1)) if us.nil? + return expected("component separator in ISA16, found %s" % us.inspect, + input.position_at(1)) unless us.match?(VALID_SEPARATOR) + element_toks << Tokens::SimpleElementTok.build(input.at(1), input.position) + + tr = input.at(2) + return eof("segment terminator for ISA", input.position_at(2)) if tr.nil? + return expected("segment terminator after ISA16, found %s" % tr.inspect, + input.position_at(2)) unless tr.match?(VALID_SEPARATOR) + + @separators.segment = tr + done(element_toks, nil, input.drop!(3)) end # Works similarly to `_next_isa_segment_id`, except the result is the @@ -292,9 +290,9 @@ def _next_segment_id(input) segment_id = buffer.to_s return expected("segment identifier, found %s" % segment_id.inspect, - start_pos) unless segment_id.match?(@segment_id) + start_pos) unless segment_id.match?(VALID_SEGMENT_ID) - return done(segment_id.to_sym, start_pos, input.lstrip_nongraphic(offset)) + return done(segment_id.to_sym, start_pos, input.drop(offset))#.lstrip_nongraphic(offset)# end # @param input should be positioned on an element separator: "NM1[*].." 
diff --git a/lib/stupidedi/ruby/blank.rb b/lib/stupidedi/ruby/blank.rb index 9d43e518e..1d7dda362 100644 --- a/lib/stupidedi/ruby/blank.rb +++ b/lib/stupidedi/ruby/blank.rb @@ -44,11 +44,11 @@ def present? # 100.blank? #=> false # def blank? - false + respond_to?(:empty?) and empty? end def present? - true + not blank? end end end diff --git a/lib/stupidedi/ruby/object.rb b/lib/stupidedi/ruby/object.rb index 86874b9ab..bf38e7802 100644 --- a/lib/stupidedi/ruby/object.rb +++ b/lib/stupidedi/ruby/object.rb @@ -38,8 +38,10 @@ def snoc(array = []) # nil.bind{|a| a.nil? } #=> true # 100.bind{|a| a.nil? } #=> false # - def bind - yield self + unless nil.respond_to?(:then) + def then + yield self + end end # @endgroup diff --git a/lib/stupidedi/values/composite_element_val.rb b/lib/stupidedi/values/composite_element_val.rb index 47ccfd8b3..bc3d65a0f 100644 --- a/lib/stupidedi/values/composite_element_val.rb +++ b/lib/stupidedi/values/composite_element_val.rb @@ -62,8 +62,8 @@ def element(n) # @return [void] # :nocov: def pretty_print(q) - id = definition.bind do |d| - "[#{d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? diff --git a/lib/stupidedi/versions/common/element_types/an.rb b/lib/stupidedi/versions/common/element_types/an.rb index 5a7d106cb..bca00a1d5 100644 --- a/lib/stupidedi/versions/common/element_types/an.rb +++ b/lib/stupidedi/versions/common/element_types/an.rb @@ -171,8 +171,8 @@ def map # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? 
@@ -270,8 +270,8 @@ def value # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -322,8 +322,8 @@ def to_x12(truncate = true) # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? diff --git a/lib/stupidedi/versions/common/element_types/dt.rb b/lib/stupidedi/versions/common/element_types/dt.rb index c223f91e6..1870edf25 100644 --- a/lib/stupidedi/versions/common/element_types/dt.rb +++ b/lib/stupidedi/versions/common/element_types/dt.rb @@ -80,8 +80,8 @@ def map # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -161,8 +161,8 @@ def empty? # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -280,8 +280,8 @@ def past # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -473,8 +473,8 @@ def future # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? 
diff --git a/lib/stupidedi/versions/common/element_types/id.rb b/lib/stupidedi/versions/common/element_types/id.rb index 708c950ff..1ddb6ecf9 100644 --- a/lib/stupidedi/versions/common/element_types/id.rb +++ b/lib/stupidedi/versions/common/element_types/id.rb @@ -95,8 +95,8 @@ def map # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -193,8 +193,8 @@ def value # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -241,8 +241,8 @@ def to_x12(truncate = true) # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? diff --git a/lib/stupidedi/versions/common/element_types/nn.rb b/lib/stupidedi/versions/common/element_types/nn.rb index f54505410..a6641e2ae 100644 --- a/lib/stupidedi/versions/common/element_types/nn.rb +++ b/lib/stupidedi/versions/common/element_types/nn.rb @@ -78,8 +78,8 @@ def map # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -159,8 +159,8 @@ def empty? # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -230,8 +230,8 @@ def too_long? 
# @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? diff --git a/lib/stupidedi/versions/common/element_types/r.rb b/lib/stupidedi/versions/common/element_types/r.rb index 4d03ff60a..2aa096593 100644 --- a/lib/stupidedi/versions/common/element_types/r.rb +++ b/lib/stupidedi/versions/common/element_types/r.rb @@ -79,7 +79,7 @@ def map # :nocov: def inspect id = definition.try do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -157,7 +157,7 @@ def empty? # :nocov: def inspect id = definition.try do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -217,7 +217,7 @@ def empty? # :nocov: def inspect id = definition.try do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? diff --git a/lib/stupidedi/versions/common/element_types/tm.rb b/lib/stupidedi/versions/common/element_types/tm.rb index 10d470b0e..4678eeaf3 100644 --- a/lib/stupidedi/versions/common/element_types/tm.rb +++ b/lib/stupidedi/versions/common/element_types/tm.rb @@ -66,8 +66,8 @@ def map # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? @@ -142,8 +142,8 @@ def empty? # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? 
ansi.forbidden(s) elsif usage.required? @@ -235,8 +235,8 @@ def to_time(date, minute = nil, second = nil) # @return [String] # :nocov: def inspect - id = definition.bind do |d| - "[#{"% 5s" % d.id}: #{d.name}]".bind do |s| + id = definition.then do |d| + "[#{"% 5s" % d.id}: #{d.name}]".then do |s| if usage.forbidden? ansi.forbidden(s) elsif usage.required? diff --git a/lib/stupidedi/writer/default.rb b/lib/stupidedi/writer/default.rb index d7e5eace8..61acb5674 100644 --- a/lib/stupidedi/writer/default.rb +++ b/lib/stupidedi/writer/default.rb @@ -17,7 +17,7 @@ def write(out = "") common = @separators.characters & @zipper.node.characters message = common.to_a.map(&:inspect).join(", ") - unless common.empty? + if common.present? raise Exceptions::OutputError, "separator characters #{message} occur as data" end @@ -51,10 +51,13 @@ def recurse(value, separators, out) separators = value.separators raise Exceptions::OutputError, - "separators.segment cannot be blank" if separators.segment.empty? + "separators.segment cannot be blank" if separators.segment.nil? or separators.segment.empty? raise Exceptions::OutputError, - "separators.element cannot be blank" if separators.element.empty? + "separators.element cannot be blank" if separators.element.nil? or separators.element.empty? + + raise Exceptions::OutputError, + "separators.component cannot be blank" if separators.component.nil? or separators.component.empty? 
end value.children.each{|c| recurse(c, separators, out) } diff --git a/spec/fixtures/004010/QM214/pass/gh-189.edi b/spec/fixtures/004010/QM214/pass/gh-189.edi new file mode 100644 index 000000000..edf4d44dc --- /dev/null +++ b/spec/fixtures/004010/QM214/pass/gh-189.edi @@ -0,0 +1,4 @@ +ISA00 00 ZZ0607047800010GBZZBLIT_T 1905072217U004010000008931T +GSQMFEDEXBLIT201905072217914X004010 +GE1914 +IEA1000000893 diff --git a/spec/lib/stupidedi/reader/pointer_spec.rb b/spec/lib/stupidedi/reader/pointer_spec.rb index bdebbbda1..9cff9d992 100644 --- a/spec/lib/stupidedi/reader/pointer_spec.rb +++ b/spec/lib/stupidedi/reader/pointer_spec.rb @@ -1,4 +1,5 @@ describe Stupidedi::Reader::Pointer do + using Stupidedi::Refinements let(:empty) { Stupidedi::Reader::Pointer.build([]) } let(:three) { Stupidedi::Reader::Pointer.build(%w(a b c)) } @@ -17,12 +18,14 @@ expect(result).to eq(%w(a b c)) expect(result).to be_a(Array) expect(result).to equal(three.storage) + expect(ignore.storage).to equal(three.storage) end it "returns a copy when asked" do ignore = shared result = three.reify(true) expect(result).to_not equal(three.storage) + expect(ignore.storage).to equal(three.storage) end it "returns a copy when needed" do @@ -154,11 +157,11 @@ specify { expect(three[0...0]).to eq([]) } specify { expect(three[0..0]).to eq(%w(a)) } specify { expect(three[1..2]).to eq(%w(b c)) } - specify { expect(three[1..]).to eq(%w(b c)) } - specify { expect(three[1...]).to eq(%w(b c)) } + specify { expect(three[1..-1]).to eq(%w(b c)) } + specify { expect(three[1...-1]).to eq(%w(b)) } specify { expect(three[-3..-2]).to eq(%w(a b)) } - specify { expect(three[-4..]).to be_nil } - specify { expect(three[4..]).to be_nil } + specify { expect(three[-4..-1]).to be_nil } + specify { expect(three[4..-1]).to be_nil } specify { expect(three[2..9]).to eq(%w(c)) } end diff --git a/spec/lib/stupidedi/reader/string_ptr_spec.rb b/spec/lib/stupidedi/reader/string_ptr_spec.rb index d0ed0cbaf..abe4d3fac 100644 --- 
a/spec/lib/stupidedi/reader/string_ptr_spec.rb +++ b/spec/lib/stupidedi/reader/string_ptr_spec.rb @@ -1,11 +1,20 @@ # frozen_string_literal: true # encoding: utf-8 describe Stupidedi::Reader::StringPtr do + + def pointer(s) + Stupidedi::Reader::Pointer.build(s) + end + let(:lower) { "abcdefghijklmnopqrstuvwxyz abcdefghijklmnOPQRSTUVWXYZ" } let(:upper) { "ABCDEFGHIJKLMNOPQRSTUVWXYZ ABCDEFGHIJKLMNopqrstuvwxyz" } - let(:lower_ptr) { Stupidedi::Reader::Pointer.build(lower.dup) } - let(:upper_ptr) { Stupidedi::Reader::Pointer.build(upper.dup) } + # NOTE: Ruby 2.2 and lower don't support frozen string literals. This is + # important for StringPtr, because if StringPtr#storage is not frozen, it + # is assumed destructive updates are safe. However, some tests below will + # fail due to destructive updates. + let(:upper_ptr) { pointer(upper.freeze) } + let(:lower_ptr) { pointer(lower.freeze) } describe "#to_s" do it "is called implicitly" do @@ -82,25 +91,25 @@ it "is zero-copy when possible" do result = lower_ptr.drop(3).take(0) << "defghi" expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end it "is zero-copy when possible" do result = lower_ptr.drop(3).take(3) << "ghi" expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end it "is zero-copy when possible" do result = lower_ptr << upper_ptr.take(3) expect(result).to eq(lower + "ABC") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end it "allocates new string otherwise" do result = lower_ptr.drop(3).take(3) << "xyz" expect(result).to eq("defxyz") - expect(result.storage).to_not eql(lower_ptr.storage) + expect(result.storage).to_not equal(lower_ptr.storage) end end @@ -108,19 +117,19 @@ it "is zero-copy when possible" do result = lower_ptr.drop(3).take(3) << lower_ptr.drop(6).take(3) expect(result).to 
eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end todo "is zero-copy when possible" do result = lower_ptr.drop(3).take(3) << lower_ptr.drop(33).take(3) expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end todo "is zero-copy when possible" do result = lower_ptr.drop(15).take(3) << upper_ptr.drop(45).take(3) expect(result).to eq("pqrstu") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end end end @@ -130,13 +139,13 @@ it "is zero-copy when possible" do result = lower_ptr.drop(3).take(0) + "defghi" expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end it "is zero-copy when possible" do result = lower_ptr.drop(3).take(3) + "ghi" expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end it "is zero-copy when possible" do @@ -156,19 +165,19 @@ it "is zero-copy when possible" do result = lower_ptr.drop(3).take(3) + lower_ptr.drop(6).take(3) expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end todo "is zero-copy when possible" do result = lower_ptr.drop(3).take(3) + lower_ptr.drop(33).take(3) expect(result).to eq("defghi") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end todo "is zero-copy when possible" do result = lower_ptr.drop(15).take(3) + upper_ptr.drop(45).take(3) expect(result).to eq("pqrstu") - expect(result.storage).to eql(lower_ptr.storage) + expect(result.storage).to equal(lower_ptr.storage) end end end @@ -307,17 +316,17 @@ context "when string doesn't end with whitespace" do it "returns self" do - expect(lower_ptr.rstrip).to eql(lower_ptr) + 
expect(lower_ptr.rstrip).to equal(lower_ptr) end end context "when string ends with whitespace" do it "is zero-copy" do expect(sb.rstrip).to eq(" abc") - expect(sb.rstrip.storage).to eql(sb.storage) + expect(sb.rstrip.storage).to eq(sb.storage) expect(mb.rstrip).to eq(" πŸ’ƒπŸ½πŸ•ΊπŸ»") - expect(mb.rstrip.storage).to eql(mb.storage) + expect(mb.rstrip.storage).to eq(mb.storage) end end end @@ -326,20 +335,48 @@ let(:sb) { Stupidedi::Reader::Pointer.build(" abc ") } let(:mb) { Stupidedi::Reader::Pointer.build(" πŸ’ƒπŸ½πŸ•ΊπŸ» ") } - context "when string doesn't end with whitespace" do + context "when string doesn't begin with whitespace" do it "returns self" do - expect(lower_ptr.rstrip).to eql(lower_ptr) + expect(lower_ptr.lstrip).to equal(lower_ptr) end end context "when string ends with whitespace" do it "is zero-copy" do expect(sb.lstrip).to eq("abc ") - expect(sb.lstrip.storage).to eql(sb.storage) + expect(sb.lstrip.storage).to equal(sb.storage) expect(mb.lstrip).to eq("πŸ’ƒπŸ½πŸ•ΊπŸ» ") - expect(mb.lstrip.storage).to eql(mb.storage) + expect(mb.lstrip.storage).to equal(mb.storage) end end end + + describe "#min_graphic_index" do + context "when string begins with a graphic character" do + specify { expect(pointer(" abc ").min_graphic_index(0)).to eq(0) } + specify { expect(pointer(" abc ").min_graphic_index(1)).to eq(1) } + specify { expect(pointer(" πŸ’ƒπŸ½πŸ•ΊπŸ» ").min_graphic_index(0)).to eq(0) } + specify { expect(pointer(" πŸ’ƒπŸ½πŸ•ΊπŸ» ").min_graphic_index(1)).to eq(1) } + end + + context "when string doesn't begin with a graphic character" do + specify { expect(pointer("\r\nabc ").min_graphic_index(0)).to eq(2) } + specify { expect(pointer("\r\nabc ").min_graphic_index(1)).to eq(2) } + specify { expect(pointer("\r\n πŸ’ƒπŸ½πŸ•ΊπŸ» ").min_graphic_index(0)).to eq(2) } + specify { expect(pointer("\r\n πŸ’ƒπŸ½πŸ•ΊπŸ» ").min_graphic_index(1)).to eq(2) } + end + + context "when string doesn't contain a graphic character" do + specify { 
expect(pointer("").min_graphic_index(0)).to eq(0) } + specify { expect(pointer("\r\n").min_graphic_index(0)).to eq(2) } + specify { expect(pointer("abc\r\n").min_graphic_index(3)).to eq(5) } + end + end + + describe "#min_whitespace_index" do + end + + describe "#max_whitespace_index" do + end end diff --git a/spec/lib/stupidedi/reader/tokenizer_spec.rb b/spec/lib/stupidedi/reader/tokenizer_spec.rb index 80c7c67bb..4d85ad175 100644 --- a/spec/lib/stupidedi/reader/tokenizer_spec.rb +++ b/spec/lib/stupidedi/reader/tokenizer_spec.rb @@ -34,6 +34,7 @@ it "yields each ISA segment and skips everything between" do ignored = Stupidedi::Tokens::IgnoredTok segment = Stupidedi::Tokens::SegmentTok + expect(tokens.map(&:class)).to eq([ignored, segment, ignored, segment, ignored, segment]) diff --git a/spec/lib/stupidedi/ruby/array_spec.rb b/spec/lib/stupidedi/ruby/array_spec.rb index ee10a146e..979554d33 100644 --- a/spec/lib/stupidedi/ruby/array_spec.rb +++ b/spec/lib/stupidedi/ruby/array_spec.rb @@ -4,7 +4,7 @@ # Creates an array with length `2*n-1` containing elements # 1, 2, 3, ..., n-1, n, n-1, ..., 3, 2, 1 def mkarray(n) - (1..n).to_a.bind{|as| as.concat(as.init.reverse) } + (1..n).to_a.then{|as| as.concat(as.init.reverse) } end describe "#sum" do @@ -23,7 +23,7 @@ def mkarray(n) end context "with a block" do - it "is same as element.bind" do + it "is same as element.then" do expect{|b| [-10].sum(&b) }.to yield_with_args(-10) expect([-10].sum(&:abs)).to eq(10) end diff --git a/spec/lib/stupidedi/ruby/object_spec.rb b/spec/lib/stupidedi/ruby/object_spec.rb index 76cb085fe..639669d18 100644 --- a/spec/lib/stupidedi/ruby/object_spec.rb +++ b/spec/lib/stupidedi/ruby/object_spec.rb @@ -49,15 +49,15 @@ describe "Object#bind" do it "provides an alternative syntax to local variables and parethesized expressions" do - expect(1.bind{|a| a + 2 } * 3).to be == ((1 + 2) * 3) + expect(1.then{|a| a + 2 } * 3).to be == ((1 + 2) * 3) end it "provides an alternative syntax to local 
variables and parethesized expressions" do - expect(1.bind{|a| (a + 2).bind{|b| b * 3 }}).to be == ((1 + 2) * 3) + expect(1.then{|a| (a + 2).then{|b| b * 3 }}).to be == ((1 + 2) * 3) end it "provides an alternative syntax to local variables and parethesized expressions" do - expect(1.bind{|a| a.bind{|b| b + 2 }.bind{|c| c * 3 }}).to be == ((1 + 2) * 3) + expect(1.then{|a| a.then{|b| b + 2 }.then{|c| c * 3 }}).to be == ((1 + 2) * 3) end end diff --git a/spec/lib/stupidedi/transaction_sets/005010/implementations/X221-HP835_spec.rb b/spec/lib/stupidedi/transaction_sets/005010/implementations/X221-HP835_spec.rb index bca00afe9..04ca33c31 100644 --- a/spec/lib/stupidedi/transaction_sets/005010/implementations/X221-HP835_spec.rb +++ b/spec/lib/stupidedi/transaction_sets/005010/implementations/X221-HP835_spec.rb @@ -160,7 +160,7 @@ context "with issues in Table 3 (Summary)" do context "with missing PLB" do - let(:parser) { Fixtures.parse!("#{fixdir}/2.edi").head } + let(:parser) { Fixtures.parse!("#{fixdir}/2.edi", encoding: "ISO-8859-1").head } it "is handled" do expect(parser).to be_deterministic diff --git a/spec/lib/stupidedi/transaction_sets_spec.rb b/spec/lib/stupidedi/transaction_sets_spec.rb index c64f74c33..8a44a9901 100644 --- a/spec/lib/stupidedi/transaction_sets_spec.rb +++ b/spec/lib/stupidedi/transaction_sets_spec.rb @@ -3,9 +3,7 @@ Fixtures.failing + Fixtures.skipping).group_by{|_, name, _| name }] - checked = Set.new - - Definitions.transaction_set_defs.each do |name, value, error| + def self.mk_fixture_spec(name, value, error, fixtures, checked = Set.new) describe name.split("::").slice(2..-1).join("::") do it "is well-defined", :schema do expect(Object.const_get(name)).to be_a(Stupidedi::Schema::TransactionSetDef) @@ -49,7 +47,7 @@ when %r{/pass/} it "can parse '#{path}'", :fixtures do expect(lambda do - machine, = Fixtures.parse!(path, config) + machine, = Fixtures.parse!(path, config: config) builder = Stupidedi::Parser::BuilderDsl.new(nil) 
machine.__send__(:roots).each do |z| builder.__send__(:critique, z.node.zipper, "", true) @@ -59,7 +57,7 @@ when %r{/skip/} pending "can parse '#{path}'", :fixtures do expect(lambda do - machine, = Fixtures.parse!(path, config) + machine, = Fixtures.parse!(path, config: config) builder = Stupidedi::Parser::BuilderDsl.new(nil) machine.__send__(:roots).each do |z| builder.__send__(:critique, z.node.zipper, "", true) @@ -69,7 +67,7 @@ when %r{/fail/} it "cannot parse '#{path}'", :fixtures do expect(lambda do - machine, = Fixtures.parse!(path, config) + machine, = Fixtures.parse!(path, config: config) builder = Stupidedi::Parser::BuilderDsl.new(nil) machine.__send__(:roots).each do |z| builder.__send__(:critique, z.node.zipper, "", true) @@ -84,13 +82,17 @@ it "can parse examples", :fixtures do parts = name.split("::").slice(2..-1) version = Fixtures.versions.invert.fetch(parts[0], parts[0]) - name = parts[2..3].join(" ") + _name = parts[2..3].join(" ") - pending "No fixtures were found in 'spec/fixtures/#{version}/#{name}/{pass,fail}'" + pending "No fixtures were found in 'spec/fixtures/#{version}/#{_name}/{pass,fail}'" fail end end end end + Definitions.transaction_set_defs.each do |name, value, error| + mk_fixture_spec(name.dup, value, error, fixtures) + end + end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 73f04491e..003c7f6ef 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,4 +1,12 @@ +# NOTE: If you want to change RSpec options for your own use, please create .rspec +# and add CLI options to it. That file is in .gitignore, so it will remain out +# of the repository. 
For starters, you probably want to add: +# +# --require spec_helper +# --require stupidedi + Bundler.setup(:default, :development, :test) + begin require "simplecov" rescue LoadError @@ -19,31 +27,7 @@ c.syntax = :expect end - # Use either of these to run only specs marked 'focus: true' - # config.filter_run(:focus => true) - # $ rspec -r spec_helper -t focus - - # Use either of these to run only randomized specs: - # config.filter_run(:random => true) - # $ rspec -r spec_helper -t random - - # When a randomized test fails, it will print a seed value you - # can use to repeat the test with the same generated inputs: - # srand 44182052595481443184625304627718313206 - - # Use either of these to skip running randomized specs: - # config.filter_run_excluding(:random => true) - # $ rspec -r spec_helper -t ~random - # Skip platform-specific examples unless our platform matches (exclude non-matches) config.filter_run_excluding(ruby: lambda{|expected| /^#{expected}/ !~ RUBY_VERSION }) config.filter_run_excluding(:skip) - #onfig.filter_run_including(:focus) - - config.profile_examples = true - #onfig.fail_fast = true - config.fail_if_no_examples = true - - # https://relishapp.com/rspec/rspec-core/v/3-8/docs/configuration/custom-deprecation-stream - # https://relishapp.com/rspec/rspec-core/v/3-8/docs/configuration/overriding-global-ordering end diff --git a/spec/support/definitions.rb b/spec/support/definitions.rb index a9ba539cf..64368b417 100644 --- a/spec/support/definitions.rb +++ b/spec/support/definitions.rb @@ -271,7 +271,7 @@ def REP; SegmentDefs::REP end def ANA(*args) AN_(SegmentDefs::ANA, *args) end def ANB(*args) AN_(SegmentDefs::ANB, *args) end def AN_(segment_def, changes = {}) - segment_def.bind do |s| + segment_def.then do |s| return s if changes.empty? 
s.copy(:element_uses => s.element_uses.map{|u| u.copy(:definition => u.definition.copy(changes))}) end @@ -281,11 +281,11 @@ def IDA(*args) ID_(SegmentDefs::IDA, *args) end def IDB(*args) ID_(SegmentDefs::IDB, *args) end def IDC(*args) ID_(SegmentDefs::IDC, *args) end def ID_(segment_def, allowed_values = [], name = nil) - segment_def.bind do |s| + segment_def.then do |s| return s if allowed_values.empty? and name.nil? s.copy( :name => name || s.name, - :element_uses => s.element_uses.head.bind do |u| + :element_uses => s.element_uses.head.then do |u| u.copy(:allowed_values => u.allowed_values.replace(allowed_values)) end.cons(s.element_uses.tail)) end @@ -295,13 +295,13 @@ def COI(*args) CO_(SegmentDefs::COI, *args) end def COJ(*args) CO_(SegmentDefs::COJ, *args) end def COK(*args) CO_(SegmentDefs::COK, *args) end def CO_(segment_def, allowed_values = [], name = nil) - segment_def.bind do |s| + segment_def.then do |s| return s if allowed_values.empty? and name.nil? s.copy( :name => name || s.name, - :element_uses => s.element_uses.head.bind do |u| - u.copy(:definition => u.definition.bind do |c| - c.copy(:component_uses => c.component_uses.head.bind do |x| + :element_uses => s.element_uses.head.then do |u| + u.copy(:definition => u.definition.then do |c| + c.copy(:component_uses => c.component_uses.head.then do |x| x.copy(:allowed_values => x.allowed_values.replace(allowed_values)) end.cons(c.component_uses.tail)) end) diff --git a/spec/support/fixtures.rb b/spec/support/fixtures.rb index 07716fc59..ee12501b9 100644 --- a/spec/support/fixtures.rb +++ b/spec/support/fixtures.rb @@ -42,8 +42,8 @@ def filepath(path) end # @return [String] - def read(path) - filepath(path).open("rb", &:read) + def read(path, *args) + filepath(path).open(*args, &:read) end def position @@ -51,7 +51,7 @@ def position end # @return [Stupidedi::Parser::StateMachine, Stupidedi::Reader::Result] - def parse(path, config = nil) + def parse(path, config:nil, encoding:nil) if path.is_a?(String) 
path = Pathname.new(path) end @@ -60,13 +60,13 @@ def parse(path, config = nil) _, config, _ = mkconfig(*parts(path)) end - tokenizer = Stupidedi::Reader.build(filepath(path), position: position) + tokenizer = Stupidedi::Reader.build(filepath(path), position: position, encoding: encoding) Stupidedi::Parser.build(config).read(tokenizer) end # @return [Stupidedi::Parser::StateMachine, Stupidedi::Reader::Result] - def parse!(path, config = nil) - machine, result = parse(path, config) + def parse!(path, *args) + machine, result = parse(path, *args) if result.fatal? result.explain{|msg| raise Stupidedi::Exceptions::ParseError, "#{msg} at #{result.position.inspect}" } diff --git a/spec/support/matchers/navigation_matchers.rb b/spec/support/matchers/navigation_matchers.rb index 32aaec105..91dd8df87 100644 --- a/spec/support/matchers/navigation_matchers.rb +++ b/spec/support/matchers/navigation_matchers.rb @@ -27,7 +27,7 @@ def description def matches?(value) @filter_tok, @syntax_val = extract_arguments(value) - matches = @syntax_val.segment? + matches = @syntax_val.segment? matches &&= !(@segment_id.present? and filter?(@filter_tok, @syntax_val)) end diff --git a/spec/support/quickcheck.rb b/spec/support/quickcheck.rb index 564851e42..1a8998f8a 100644 --- a/spec/support/quickcheck.rb +++ b/spec/support/quickcheck.rb @@ -181,7 +181,7 @@ def between(lo, hi = nil) rand(hi + 1 - lo) + lo when Range # @todo: #to_a is wasteful for large Ranges - lo.to_a.bind{|a| a[between(0, a.length - 1)] } + lo.to_a.then{|a| a[between(0, a.length - 1)] } end end