--utf8 module (Cosmin Apreutesei, public domain).
--byte indices are i's, char (codepoint) indices are ci's.
--invalid characters are counted as 1-byte chars so they don't get lost. validate/sanitize beforehand as needed.
local glue = require'glue' --for autoload
local utf8 = {}

--byte index of the next char after the char at byte index i, followed by a valid flag for the char at byte index i.
--nil if not found. invalid characters are iterated as 1-byte chars.
function utf8.next_raw(s, i)
	if not i then
		if #s == 0 then return nil end
		return 1, true --fake flag (doesn't matter since this flag is not to be taken as full validation)
	end
	if i > #s then return end
	local c = s:byte(i)
	if c >= 0x00 and c <= 0x7F then
		i = i + 1
	elseif c >= 0xC2 and c <= 0xDF then
		i = i + 2
	elseif c >= 0xE0 and c <= 0xEF then
		i = i + 3
	elseif c >= 0xF0 and c <= 0xF4 then
		i = i + 4
	else --invalid
		return i + 1, false
	end
	if i > #s then return end
	return i, true
end

--next() is the generic iterator and can be replaced for different semantics. next_raw() must preserve its semantics.
utf8.next = utf8.next_raw

--iterate chars, returning the byte index where each char starts
function utf8.byte_indices(s, previ)
	return utf8.next, s, previ
end

--number of chars in string
function utf8.len(s)
	local len = 0
	for _ in utf8.byte_indices(s) do
		len = len + 1
	end
	return len
end

--byte index given char index. nil if the index is outside the string.
function utf8.byte_index(s, target_ci)
	if target_ci < 1 then return end
	local ci = 0
	for i in utf8.byte_indices(s) do
		ci = ci + 1
		if ci == target_ci then
			return i
		end
	end
	assert(target_ci > ci, 'invalid index')
end

--char index given byte index. nil if the index is outside the string.
function utf8.char_index(s, target_i)
	if target_i < 1 or target_i > #s then return end
	local ci = 0
	for i in utf8.byte_indices(s) do
		ci = ci + 1
		if i == target_i then
			return ci
		end
	end
	error'invalid index'
end

--byte index of the prev. char before the char at byte index i, which defaults to #s + 1.
--nil if the index is outside the 2..#s+1 range.
--NOTE: unlike next(), this is a O(N) operation!
function utf8.prev(s, nexti)
	nexti = nexti or #s + 1
	if nexti <= 1 or nexti > #s + 1 then return end
	local lasti, lastvalid = utf8.next(s)
	for i, valid in utf8.byte_indices(s) do
		if i == nexti then
			return lasti, lastvalid
		end
		lasti, lastvalid = i, valid
	end
	if nexti == #s + 1 then
		return lasti, lastvalid
	end
	error'invalid index'
end

--iterate chars in reverse order, returning the byte index where each char starts.
function utf8.byte_indices_reverse(s, nexti)
	if #s < 200 then
		--using prev() is a O(N^2/2) operation, ok for small strings (200 chars need 40,000 iterations)
		return utf8.prev, s, nexti
	else
		--store byte indices in a table and iterate them in reverse.
		--this is 40x slower than byte_indices() but still fast at 2mil chars/second (but eats RAM and makes garbage).
		local t = {}
		for i in utf8.byte_indices(s) do
			if nexti and i >= nexti then break end
			table.insert(t, i)
		end
		local i = #t + 1
		return function()
			i = i - 1
			return t[i]
		end
	end
end

--sub based on char indices, which, unlike with standard string.sub(), can't be negative.
--start_ci can be 1..inf and end_ci can be 0..inf. end_ci can be nil meaning last char.
--if start_ci is out of range or end_ci < start_ci, the empty string is returned.
--if end_ci is out of range, it is considered to be the last position in the string.
function utf8.sub(s, start_ci, end_ci)
	--assert for positive indices because we might implement negative indices in the future.
	assert(start_ci >= 1)
	assert(not end_ci or end_ci >= 0)
	local ci = 0
	local start_i = 1
	local end_i = s:len()
	for i in utf8.byte_indices(s) do
		ci = ci + 1
		if ci == start_ci then
			start_i = i
		end
		if ci == end_ci + 1 then
			end_i = i - 1
			break
		end
	end
	if not start_i then
		assert(start_ci > ci, 'invalid index')
		return ''
	end
	if end_ci and not end_i then
		if end_ci < start_ci then
			return ''
		end
		assert(end_ci > ci, 'invalid index')
	end
	return s:sub(start_i, end_i)
end

--check if a string contains a substring at byte index i without making garbage.
--nil if the index is out of range. true if searching for the empty string.
function utf8.contains(s, i, sub)
	if i < 1 or i > #s then return nil end
	for si = 1, #sub do
		if s:byte(i + si - 1) ~= sub:byte(si) then
			return false
		end
	end
	return true
end

--count the number of occurences of a substring in a string. the substring cannot be the empty string.
function utf8.count(s, sub)
	assert(#sub > 0)
	local count = 0
	local i = 1
	while i do
		if utf8.contains(s, i, sub) then
			count = count + 1
			i = i + #sub
			if i > #s then break end
		else
			i = utf8.next(s, i)
		end
	end
	return count
end

--utf8 validation and sanitization

--check if there's a valid utf8 codepoint at byte index i. valid ranges for each utf8 byte are:
-- byte  1          2           3          4
--------------------------------------------
-- 00 - 7F
-- C2 - DF    80 - BF
-- E0         A0 - BF     80 - BF
-- E1 - EC    80 - BF     80 - BF
-- ED         80 - 9F     80 - BF
-- EE - EF    80 - BF     80 - BF
-- F0         90 - BF     80 - BF    80 - BF
-- F1 - F3    80 - BF     80 - BF    80 - BF
-- F4         80 - 8F     80 - BF    80 - BF
function utf8.isvalid(s, i)
	local c = s:byte(i)
	if not c then
		return false
	elseif c >= 0x00 and c <= 0x7F then
		return true
	elseif c >= 0xC2 and c <= 0xDF then
		local c2 = s:byte(i + 1)
		return c2 and c2 >= 0x80 and c2 <= 0xBF
	elseif c >= 0xE0 and c <= 0xEF then
		local c2 = s:byte(i + 1)
		local c3 = s:byte(i + 2)
		if c == 0xE0 then
			return c2 and c3 and
				c2 >= 0xA0 and c2 <= 0xBF and
				c3 >= 0x80 and c3 <= 0xBF
		elseif c >= 0xE1 and c <= 0xEC then
			return c2 and c3 and
				c2 >= 0x80 and c2 <= 0xBF and
				c3 >= 0x80 and c3 <= 0xBF
		elseif c == 0xED then
			return c2 and c3 and
				c2 >= 0x80 and c2 <= 0x9F and
				c3 >= 0x80 and c3 <= 0xBF
		elseif c >= 0xEE and c <= 0xEF then
			if c == 0xEF and c2 == 0xBF and (c3 == 0xBE or c3 == 0xBF) then
				return false --uFFFE and uFFFF non-characters
			end
			return c2 and c3 and
				c2 >= 0x80 and c2 <= 0xBF and
				c3 >= 0x80 and c3 <= 0xBF
		end
	elseif c >= 0xF0 and c <= 0xF4 then
		local c2 = s:byte(i + 1)
		local c3 = s:byte(i + 2)
		local c4 = s:byte(i + 3)
		if c == 0xF0 then
			return c2 and c3 and c4 and
				c2 >= 0x90 and c2 <= 0xBF and
				c3 >= 0x80 and c3 <= 0xBF and
				c4 >= 0x80 and c4 <= 0xBF
		elseif c >= 0xF1 and c <= 0xF3 then
			return c2 and c3 and c4 and
				c2 >= 0x80 and c2 <= 0xBF and
				c3 >= 0x80 and c3 <= 0xBF and
				c4 >= 0x80 and c4 <= 0xBF
		elseif c == 0xF4 then
			return c2 and c3 and c4 and
				c2 >= 0x80 and c2 <= 0x8F and
				c3 >= 0x80 and c3 <= 0xBF and
				c4 >= 0x80 and c4 <= 0xBF
		end
	end
	return false
end

--byte index of the next valid utf8 char after the char at byte index i.
--nil if indices go out of range. invalid characters are skipped.
function utf8.next_valid(s, i)
	local valid
	i, valid = utf8.next_raw(s, i)
	while i and (not valid or not utf8.isvalid(s, i)) do
		i, valid = utf8.next(s, i)
	end
	return i
end

--iterate valid chars, returning the byte index where each char starts
function utf8.valid_byte_indices(s)
	return utf8.next_valid, s
end

--assert that a string only contains valid utf8 characters
function utf8.validate(s)
	for i, valid in utf8.byte_indices(s) do
		if not valid or not utf8.isvalid(s, i) then
			error(string.format('invalid utf8 char at #%d', i))
		end
	end
end

local function table_lookup(s, i, j, t)
	return t[s:sub(i, j)]
end

--replace characters in string based on a function f(s, i, j, ...) -> replacement_string | nil
function utf8.replace(s, f, ...)
	if type(f) == 'table' then
		return utf8.replace(s, table_lookup, f)
	end
	if s == '' then
		return s
	end
	local t = {}
	local lasti = 1
	for i in utf8.byte_indices(s) do
		local nexti = utf8.next(s, i) or #s + 1
		local repl = f(s, i, nexti - 1, ...)
		if repl then
			table.insert(t, s:sub(lasti, i - 1))
			table.insert(t, repl)
			lasti = nexti
		end
	end
	table.insert(t, s:sub(lasti))
	return table.concat(t)
end

local function replace_invalid(s, i, j, repl_char)
	if not utf8.isvalid(s, i) then
		return repl_char
	end
end

--replace invalid utf8 chars with a replacement char
function utf8.sanitize(s, repl_char)
	repl_char = repl_char or '�' --\uFFFD
	return utf8.replace(s, replace_invalid, repl_char)
end


if not ... then require 'utf8_test' end

return glue.autoload(utf8, {
	upper = 'utf8_case',
	lower = 'utf8_case',
})