# encode / decode Mojibake-ed string

class String
  # Returns a copy of the receiver tagged as UTF-8 (bytes are not changed).
  # @return [String]
  def u
    self.dup.force_encoding('utf-8')
  end
end

# Mojibake
#
# Produces the garbled text obtained when UTF-8 bytes are misread as CP932
# (#encode) and tries to recover the original text from it (#decode).
class Mojibake
  # Hint table produced by the most recent #decode call:
  # maps each "(n)" placeholder in the result to its candidate list.
  attr_reader :hint

  # Convenience wrapper around Mojibake#encode.
  # @param src [String]
  # @return [String]
  def self.encode(src)
    self.new.encode(src)
  end

  # Reinterprets the bytes of +src+ as CP932 and transcodes them to UTF-8.
  # Bytes that are undefined or invalid in CP932 become NUL first, then every
  # run of NULs is collapsed into a single '・'.
  #
  # The argument is not modified: the work is done on a copy, so frozen
  # strings are accepted and the caller's string keeps its encoding.
  # @param src [String]
  # @return [String] UTF-8 string
  def encode(src)
    src.dup.force_encoding('cp932')
      .encode('utf-8', undef: :replace, invalid: :replace, replace: "\0")
      .gsub(/\0+/, '・'.u)
  end

  # Decodes a mojibake string with Decoder and stores the decoder's hint
  # table in #hint.
  # @param src [String]
  # @return [String]
  def decode(src)
    d = Decoder.new(src)
    r = d.decode
    @hint = d.hint
    r
  end

  # Turns every literal "\xHH" escape in +src+ into the byte it names, tags
  # the result as UTF-8, and re-escapes (as "\xHH" per byte) whatever is still
  # not valid UTF-8.
  # @param src [String]
  # @return [String]
  def code_to_char(src)
    src.b.gsub(/\\x([0-9a-f][0-9a-f])/i){$1.to_i(16).chr}.u
      .scrub{|c| c.bytes.map{'\\x'+_1.ord.to_s(16).upcase}.join}
  end

  # Decoder
  #
  # Runs the four decoding phases over one source string.
  class Decoder
    # Pattern source for an escaped byte ("\xHH"); not referenced in this file.
    H = -'\\\\x[0-9A-F][0-9A-F]'

    # Builds (once) and returns a table of every character reachable from a
    # two-byte CP932 sequence, keyed by the binary form of its UTF-8 bytes.
    # Characters in U+E000..U+F8FF are left out, and byte pairs that fail to
    # transcode are skipped.
    # @return [Hash{String => String}]
    def self.utf8_code
      return @utf8 if @utf8
      # cp932 = {}
      utf8 = {}
      ((0x81..0x9F).to_a+(0xE0..0xFF).to_a).each do |c1|
        ((0x40..0x7E).to_a+(0x80..0xFC).to_a).each do |c2|
          s = [c1, c2].pack('c*').force_encoding('cp932')
          u = s.encode('utf-8', 'cp932')
          next if u.ord >= 0xe000 && u.ord <= 0xf8ff  # private use area
          # cp932[s.b] = s.encode('utf-8')
          utf8[u.b] = u
        rescue EncodingError
          # ignore
        end
      end
      @utf8 = utf8
    end
    # Build the table eagerly when the class body is evaluated.
    self.utf8_code

    # Placeholder => candidates table filled in by #phase4.
    attr_reader :hint

    # @param src [String] mojibake text to decode
    def initialize(src)
      @src = src
    end

    # Runs phase1..phase4 over the source string.
    # @return [String]
    def decode
      a = phase1(@src)
      a = phase2(a)
      a = phase3(a)
      phase4(a)
    end

    # Splits the string into an array of characters.
    #
    # '・' is mapped to NUL, the text is transcoded to CP932 (unconvertible
    # characters also become NUL) and the resulting bytes are read as UTF-8.
    # Adjacent invalid pieces, optionally bridged by one NUL, are merged into
    # one element. Merged elements containing NUL for which no candidate
    # exists are split apart again, and lone NUL elements are dropped.
    # @param src [String]
    # @return [Array<String>]
    def phase1(src)
      s = src.u.tr('・'.u, "\0").encode('cp932', undef: :replace, invalid: :replace, replace: "\0").u
      chars = s.chars
      a = []
      while (c = chars.shift)
        if !c.valid_encoding? && a.last && !a.last.valid_encoding? && !utf8_first?(c)
          a.last.concat c
        elsif c == "\0" && a.last && !a.last.valid_encoding? && chars[0] && !chars[0].valid_encoding? && !utf8_first?(chars[0])
          a.last.concat c
          a.last.concat chars.shift
        else
          a.push c
        end
      end
      b = []
      while (c = a.shift)
        if c != "\0" && c.b =~ /\0/ && candidate(c).empty?
          b.concat c.b.split(/\0/).map(&:u)
        else
          b.push c
        end
      end
      b.delete("\0")
      b
    end

    # True when the first byte of +c+ is 0xC2 or greater.
    # @param c [String]
    def utf8_first?(c)
      c.b.ord >= 0xc2
    end

    # Builds candidates for incomplete characters.
    # When there is exactly one candidate it is adopted directly; with no
    # candidate the element is left as is.
    # @param src [Array<String>]
    # @return [Array<String, Array<String>>]
    def phase2(src)
      src.map do |c|
        next c if c.valid_encoding?
        a = candidate(c)
        if a.empty?
          c
        elsif a.size == 1
          a[0]
        else
          a
        end
      end
    end

    # Narrows candidate characters down to those that reproduce the original
    # mojibake: a candidate is kept when re-encoding the text decoded so far
    # plus the candidate yields a prefix of the source bytes.
    # @param src [Array<String, Array<String>>]
    # @return [Array<String, Array<String>>]
    def phase3(src)
      s = src.dup
      out = []
      t = []  # text decided so far, used as the prefix when re-encoding
      while (c = s.shift)
        if c.is_a? String
          out.push c
          t.push c
          next
        end
        # Consecutive candidate lists are combined and filtered together.
        while s[0].is_a? Array
          c2 = s.shift
          x = []
          if c.size * c2.size > 100000  # number of candidate combinations
            # Too many combinations: give up narrowing and pass the rest through.
            out.push(c, c2)
            out.concat s
            return out
          end
          product(c, c2) do |a|
            e = Mojibake.encode([t, a].join)
            e.delete_suffix!('・'.u)
            x.push a.join if @src.b.start_with? e.b
          end
          c = x
        end
        if s[0].is_a? String
          # Use the following fixed character as extra context (it is not consumed).
          c2 = s[0]
          x = []
          product(c, [c2]) do |a|
            e = Mojibake.encode([t, a].join)
            e.delete_suffix!('・'.u)
            x.push a[0] if @src.b.start_with? e.b
          end
        else
          x = c
        end
        out.push x.empty? ? c : x.size == 1 ? x[0] : x
        t.push x[0]
      end
      out
    end

    # Replaces characters that still have multiple candidates with "(n)" and
    # records the candidates for each placeholder in @hint. Bytes that are
    # still not valid UTF-8 in the joined result are written as "\xHH", one
    # escape per byte.
    # @param src [Array<String, Array<String>>]
    # @return [String]
    def phase4(src)
      out = []
      src.each do |c|
        if c.is_a?(Array) && c.size > 100
          # Large candidate list: regroup per character position.
          out.concat zip(*c.map(&:chars)).map{_1.uniq.sort}
        else
          out.push c
        end
      end
      hint = {}
      i = 1
      out = out.map do |c|
        if c.is_a? String
          c
        else
          x = "(#{i})"
          hint[x] = c
          i += 1
          x
        end
      end
      @hint = hint
      # scrub may hand over several bytes at once (e.g. a truncated multibyte
      # sequence), so every byte of the chunk is escaped, not just the first.
      out.join.scrub{|c| c.bytes.map{format('\\x%02X', _1)}.join}
    end

    # same as Enumerator.product
    def product(*x, &)
      if x.size == 1
        x[0].map{[_1]}.each(&)
      else
        x[0].product(*x[1..], &)
      end
    end

    # Array#zip with the receiver passed as the first argument.
    def zip(a, *x)
      a.zip(*x)
    end

    # Instance-level shortcut for Decoder.utf8_code.
    def utf8_code
      self.class.utf8_code
    end

    # Lists the characters in the utf8_code table whose UTF-8 bytes match the
    # byte fragment +e+, where every NUL in +e+ stands for one or more unknown
    # bytes and any bytes may precede or follow the fragment.
    # @param e [String] fragment, possibly containing NUL placeholders
    # @return [Array<String>]
    def candidate(e)
      list = []
      re = Regexp.new('\A(.*)'+e.b.gsub(/\0/, '(.+)')+'(.*)\z')
      utf8_code.each do |b, u|
        next unless re =~ b
        unless ($3 || $2).empty?
          # The last capture is non-empty. Skip the entry when the captured
          # bytes (or their first byte alone) transcode cleanly from CP932.
          # NOTE(review): presumably such bytes would have survived as a
          # visible character rather than being lost - confirm.
          m = $1 + $2 + $3.to_s
          next if m.force_encoding('cp932').encode('utf-8') rescue nil
          next if m.b[0].force_encoding('cp932').encode('utf-8') rescue nil
        end
        list.push u
      end
      list
    end
  end
end