-- Test script for the `unicode` library: each helper below runs one library
-- function from the sub-table unicode[ctype] and prints an "ok"/"NOK" line.
require("unicode")
local utf8 = unicode.utf8
unicode.string = string -- for tests unicode[ctype]

local sprintf = string.format

--- Format a message with string.format and print it.
local function printf (fmt, ...)
  return print(sprintf(fmt, ...))
end

--- Report the outcome of a single test.
-- Prints an "ok" line when `got` equals `ok`, otherwise a "NOK" line that
-- also shows the actual value (a nil/false `got` is shown as "").
-- @param test  label describing the call under test
-- @param ok    expected value
-- @param got   actual value
local function check (test, ok, got)
  if ok == got then
    return printf("ok %s = %s", test, ok)
  end
  return printf("NOK %s = %s GOT '%s'", test, ok, got or "")
end

--- Like check(), for calls that return several values.
-- All actual results are joined with "," before being compared to `ok`;
-- a missing first result is replaced by "" so "no result" compares as "".
local function checka (test, ok, ...)
  local results = {...}
  results[1] = results[1] or ""
  return check(test, ok, table.concat(results, ","))
end

--- Compare the length of `str` as reported by string.len, utf8.len and
-- unicode.grapheme.len against the expected bytes/codes/chars triple.
-- `codes` defaults to `bytes`, `chars` defaults to `codes`.
local function testlen (str, bytes, codes, chars)
  codes = codes or bytes
  chars = chars or codes
  local expected = sprintf("%d/%d/%d", bytes, codes, chars)
  local actual = sprintf("%d/%d/%d",
    string.len(str), utf8.len(str), unicode.grapheme.len(str))
  return check(sprintf("len '%s'", str), expected, actual)
end

-- 176 = 00B0;DEGREE SIGN
--       UTF-8: C2,B0 = \194\176
-- 196 = 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
-- 214 = 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
-- 776 = 0308;COMBINING DIAERESIS
--       UTF-8: CC,88 = \204\136

testlen("A\tB",3) -- plain Latin-1
testlen("\176\196\214",3) -- plain Latin-1
testlen("\196\176\214",3,2) -- C4,B0 is valid seq 0130 I WITH DOT ABOVE
testlen("\192\178",2) -- C0,B2 is bad seq for 2
testlen("°ÄÖ",6,3) -- simple Latin-1 chars in UTF-8
testlen("\204\136A\204\136O\204\136",8,5,3) -- decomposed (with broken lead)

--- Check unicode[ctype].sub(str, start, e) against `ok`.
local function testsub (ctype, ok, str, start, e)
  return check(sprintf("%s.sub('%s',%d,%d)", ctype, str, start, e),
    ok, unicode[ctype].sub(str, start, e))
end

testsub("ascii","BCD","ABCDE",2,4)
testsub("utf8","BCD","ABCDE",2,4)
testsub("latin1","Ä","°ÄÖ",3,4)
testsub("utf8","Ä","°ÄÖ",2,2)
testsub("utf8","ÄÖ","°ÄÖ",2,-1)
testsub("utf8","\204\136","A\204\136O\204\136",2,2) -- decomposed
testsub("grapheme","O\204\136","A\204\136O\204\136",2,2) -- decomposed

--- Check unicode[ctype].byte(str, ...) against `ok`
-- (all returned values joined with ",").
local function testbyte (ctype, ok, str, ...)
  return checka(
    sprintf("%s.byte('%s',%s)", ctype, str, table.concat({...}, ",")),
    ok, unicode[ctype].byte(str, ...))
end

-- The fixture must be the UTF-8 text "Ä°Ö" (bytes C3,84 C2,B0 C3,96):
-- only then are bytes 3,4 the sequence for ° and code points 2,3 = °,Ö.
testbyte("string","194,176","Ä°Ö",3,4) -- the UTF-8 seq for °
testbyte("ascii","194,176","Ä°Ö",3,4)
testbyte("utf8","176,214","Ä°Ö",2,3) -- code points for °,Ö
testbyte("utf8","65,776","\204\136A\204\136O\204\136",2,3) -- decomposed
testbyte("grapheme","65,776","\204\136A\204\136O\204\136",2) -- decomposed

--- Check unicode[ctype].char(...) against `ok`.
local function testchar (ctype, ok, ...)
  return check(sprintf("%s.char(%s)", ctype, table.concat({...}, ",")),
    ok, unicode[ctype].char(...))
end

testchar("ascii", "AB", 65,66)
testchar("ascii", "\176", 176)
testchar("utf8", "\194\176", 176)

--- Check unicode[ctype].lower(str) against `lo` and
-- unicode[ctype].upper(str) against `up`.
local function testcase (ctype, str, up, lo)
  check(sprintf("%s.lower('%s')", ctype, str), lo, unicode[ctype].lower(str))
  check(sprintf("%s.upper('%s')", ctype, str), up, unicode[ctype].upper(str))
end

-- upper/lower also fixes plain Latin
testcase("utf8","Ab\196üo\204\136","ABÄÜO\204\136","abäüo\204\136")
testcase("ascii","Ab\196üo\204\136","AB\196üO\204\136","ab\196üo\204\136")
testcase("latin1","Ab\196","AB\196","ab\228")

--- Check unicode[ctype].reverse(str) against `ok`.
local function testrev (ctype, ok, str)
  return check(sprintf("%s.reverse('%s')", ctype, str),
    ok, unicode[ctype].reverse(str))
end

testrev("ascii","b\136\204oa\176\194ba","ab°ao\204\136b")
testrev("utf8","b\204\136oa°ba","ab°ao\204\136b")
testrev("grapheme","bo\204\136a°ba","ab°ao\204\136b")

--- Check unicode[ctype].find(str, pat) against `ok`
-- (all returned values joined with ","; "" when nothing is found).
local function testfind (ctype, ok, str, pat)
  return checka(sprintf("%s.find('%s','%s')", ctype, str, pat),
    ok, unicode[ctype].find(str, pat))
end

testfind("ascii","1,1","e=mc2","%a")
testfind("ascii","3,4","e=mc2","%a%a")
testfind("ascii","5,5","e=mc2","%d")
testfind("ascii","","Ä","%a")
testfind("ascii","1,2","Ä","%A*")
testfind("latin1","1,1","Ä","%a")
testfind("utf8","1,2","Ä","%a")
testfind("utf8","1,1","o\204\136","%a*")
testfind("utf8","2,3","o\204\136","%A")
testfind("utf8","1,1","o\204\136",".")
testfind("grapheme","1,3","o\204\136","%a*")
testfind("grapheme","2,3","o\204\136","%A") -- didn't expect this?
-- More find() cases: sets, ranges, a trailing "-" inside a set, and captures.
testfind("grapheme","1,3","o\204\136",".")
testfind("utf8","4,5","ÜHÄPPY","[À-Ö]")
testfind("utf8","4,5","ÜHÄPPY","[Ä-]")
testfind("utf8","7,7","ÜHÄP-PY","[ä-]")
testfind("ascii","1,4","abcdef","%a*d")
testfind("utf8","1,10","äöüßü","%a*ü")
testfind("utf8","1,6","äöüß","%a*ü")
testfind("utf8","4,5,Ä","ÜHÄPPY","([À-Ö])")
testfind("utf8","1,5,ÜHÄ","ÜHÄ_PPY","([%w]+)")
testfind("utf8","1,9,ÜHÄ_PPY","ÜHÄ_PPY","([%w_]+)")

--- Check the string returned by unicode[ctype].gsub(str, pat, repl)
-- against `ok`.
local function testgsub (ctype, ok, str, pat, repl)
  -- The call is parenthesised so that only its first result (the resulting
  -- string) is passed on to check(); any further results are discarded.
  return check(sprintf("%s.gsub('%s','%s','%s')", ctype, str, pat, repl),
    ok, (unicode[ctype].gsub(str, pat, repl)))
end

testgsub("ascii","hello hello world world","hello world", "(%w+)", "%1 %1")
testgsub("ascii","world hello Lua from", "hello world from Lua",
  "(%w+)%s*(%w+)", "%2 %1")
testgsub("ascii","l helö wöfr rldöL müä", "hellö wörld fröm Lüä",
  "(%w+)%s*(%w+)", "%2 %1")
testgsub("utf8","wörld hellö Lüä fröm", "hellö wörld fröm Lüä",
  "(%w+)%s*(%w+)", "%2 %1")
testgsub("utf8","HÜppÄ","HÄppÜ","([À-Ö])(%l*)(%u)","%3%2%1")

-- Round trip: every code 0..65535 must survive utf8.char -> utf8.byte.
-- `fail` is local so the script does not leak a global counter.
local fail = 0
for i = 0, 65535 do
  if i ~= utf8.byte(utf8.char(i)) then
    fail = fail + 1
  end
end
check("code-decode failures", 0, fail)

--[[ print the table (disabled; math.mod was renamed math.fmod in Lua 5.1)
for i=192,65535,64 do
  local k = i/64
  io.write(sprintf("%04x\\%3d\\%3d ",i, 224+k/64, 128+math.fmod(k,64)))
  for j=i,i+63 do io.write(utf8.char(j)) end
  io.write("\n")
end
]]