require("unicode")

local utf8 = unicode.utf8

unicode.string = string -- for tests unicode[ctype]
local sprintf = string.format
local function printf (fmt, ...) return print(sprintf(fmt, ...)) end

local function check (test, ok, got)
 if ok == got then return printf("ok  %s = %s",test,ok) end
 return printf("NOK %s = %s GOT '%s'",test, ok, got or "<nil>")
end
local function checka (test, ok, ...)
 local arg = {...}
 arg[1] = arg[1] or ""
 return check(test, ok, table.concat(arg, ","))
end


local function testlen (str,bytes,codes,chars)
 codes = codes or bytes
 chars = chars or codes
 return check(sprintf("len '%s'", str),
  sprintf("%d/%d/%d", bytes, codes, chars),
sprintf("%d/%d/%d", string.len(str), utf8.len(str), unicode.grapheme.len(str)))
end

-- 176 = 00B0;DEGREE SIGN -- UTF-8: C2,B0 = \194\176
-- 196 = 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
-- 214 = 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
-- 776 = 0308;COMBINING DIAERESIS -- UTF-8: CC,88 = \204\136
testlen("A\tB",3) -- plain Latin-1
testlen("\176\196\214",3) -- plain Latin-1
testlen("\196\176\214",3,2) -- C4,B0 is valid seq 0130 I WITH DOT ABOVE
testlen("\192\178",2) -- C0,B2 is bad seq for 2
testlen("°ÄÖ",6,3) -- simple Latin-1 chars in UTF-8
testlen("\204\136A\204\136O\204\136",8,5,3) -- decomposed (with broken lead)


local function testsub (ctype,ok,str,start,e)
 return check(sprintf("%s.sub('%s',%d,%d)", ctype, str, start, e), ok,
  unicode[ctype].sub(str,start,e))
end
testsub("ascii","BCD","ABCDE",2,4)
testsub("utf8","BCD","ABCDE",2,4)
testsub("latin1","Ä","°ÄÖ",3,4)
testsub("utf8","Ä","°ÄÖ",2,2)
testsub("utf8","ÄÖ","°ÄÖ",2,-1)
testsub("utf8","\204\136","A\204\136O\204\136",2,2) -- decomposed
testsub("grapheme","O\204\136","A\204\136O\204\136",2,2) -- decomposed


local function testbyte (ctype, ok, str, ...)
 return checka(sprintf("%s.byte('%s',%s)",ctype,str,table.concat({...}, ",")),
  ok, unicode[ctype].byte(str, ...))
end
testbyte("string","194,176","Ä°Ö",3,4) -- the UTF-8 seq for °
testbyte("ascii","194,176","Ä°Ö",3,4)
testbyte("utf8","176,214","Ä°Ö",2,3) -- code points for °,Ö
testbyte("utf8","65,776","\204\136A\204\136O\204\136",2,3) -- decomposed
testbyte("grapheme","65,776","\204\136A\204\136O\204\136",2) -- decomposed


local function testchar (ctype, ok, ...)
 return check(sprintf("%s.char(%s)",ctype,table.concat({...}, ",")),
  ok, unicode[ctype].char(...))
end
testchar("ascii", "AB", 65,66)
testchar("ascii", "\176", 176)
testchar("utf8", "\194\176", 176)


local function testcase (ctype,str,up,lo)
 check(sprintf("%s.lower('%s')", ctype, str), lo, unicode[ctype].lower(str))
 check(sprintf("%s.upper('%s')", ctype, str), up, unicode[ctype].upper(str))
end
-- upper/lower also fixes plain Latin
testcase("utf8","Ab\196üo\204\136","ABÄÜO\204\136","abäüo\204\136")
testcase("ascii","Ab\196üo\204\136","AB\196üO\204\136","ab\196üo\204\136")
testcase("latin1","Ab\196","AB\196","ab\228")


local function testrev (ctype,ok,str)
 return check(sprintf("%s.reverse('%s')",ctype,str),
  ok, unicode[ctype].reverse(str))
end
testrev("ascii","b\136\204oa\176\194ba","ab°ao\204\136b");
testrev("utf8","b\204\136oa°ba","ab°ao\204\136b");
testrev("grapheme","bo\204\136a°ba","ab°ao\204\136b");

 

local function testfind (ctype,ok,str,pat)
 return checka(sprintf("%s.find('%s','%s')",ctype,str,pat),
  ok, unicode[ctype].find(str, pat))
end
testfind("ascii","1,1","e=mc2","%a")
testfind("ascii","3,4","e=mc2","%a%a")
testfind("ascii","5,5","e=mc2","%d")
testfind("ascii","","Ä","%a")
testfind("ascii","1,2","Ä","%A*")
testfind("latin1","1,1","Ä","%a")
testfind("utf8","1,2","Ä","%a")
testfind("utf8","1,1","o\204\136","%a*")
testfind("utf8","2,3","o\204\136","%A")
testfind("utf8","1,1","o\204\136",".")
testfind("grapheme","1,3","o\204\136","%a*")
testfind("grapheme","2,3","o\204\136","%A") -- didn't expect this?
testfind("grapheme","1,3","o\204\136",".")
testfind("utf8","4,5","ÜHÄPPY","[À-Ö]")
testfind("utf8","4,5","ÜHÄPPY","[Ä-]")
testfind("utf8","7,7","ÜHÄP-PY","[ä-]")
testfind("ascii","1,4","abcdef","%a*d")
testfind("utf8","1,10","äöüßü","%a*ü")
testfind("utf8","1,6","äöüß","%a*ü")
testfind("utf8","4,5,Ä","ÜHÄPPY","([À-Ö])")
testfind("utf8","1,5,ÜHÄ","ÜHÄ_PPY","([%w]+)")
testfind("utf8","1,9,ÜHÄ_PPY","ÜHÄ_PPY","([%w_]+)")


local function testgsub (ctype,ok,str,pat,repl)
 return check(sprintf("%s.gsub('%s','%s','%s')",ctype,str,pat,repl),
  ok, unicode[ctype].gsub(str,pat,repl))
end
testgsub("ascii","hello hello world world","hello world", "(%w+)", "%1 %1")
testgsub("ascii","world hello Lua from",
 "hello world from Lua", "(%w+)%s*(%w+)", "%2 %1")
testgsub("ascii","l helö wöfr rldöL müä",
 "hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1")
testgsub("utf8","wörld hellö Lüä fröm",
 "hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1")
testgsub("utf8","HÜppÄ","HÄppÜ","([À-Ö])(%l*)(%u)","%3%2%1")


fail = 0
for i=0,65535 do if i ~= utf8.byte(utf8.char(i)) then fail=fail+1 end end
check("code-decode failures", 0, fail)

--[[ print the table
for i=192,65535,64 do
 local k = i/64
 io.write(sprintf("%04x\\%3d\\%3d ",i, 224+k/64, 128+math.mod(k,64)))
 for j=i,i+63 do
  io.write(utf8.char(j))
 end
 io.write("\n")
end
]]