--- Reading and querying simple tabular data.
-- This provides a way of creating basic SQL-like queries.
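-- A minimal usage sketch (the file name and column names are hypothetical):
--    local data = require 'pl.data'
--    local d = data.read 'scores.csv'
--    for name,score in d:select 'name,score where score > 50' do
--        print(name,score)
--    end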
-- See the Guide
-- @class module
-- @name pl.data
local stringx = require 'pl.stringx'
local utils = require 'pl.utils'
local seq = require 'pl.seq'
local tablex = require 'pl.tablex'
local List = require 'pl.list'.List
local rstrip,count = stringx.rstrip,stringx.count
local _DEBUG = rawget(_G,'_DEBUG')
local throw,patterns,choose,function_arg,split = utils.throw,utils.patterns,utils.choose,utils.function_arg,utils.split
local append,concat = table.insert,table.concat
local map,find = tablex.map,tablex.find
local gsub = string.gsub
local io = io
local _G,print,loadstring,type,tonumber,ipairs,setmetatable,pcall,error,setfenv =
    _G,print,loadstring,type,tonumber,ipairs,setmetatable,pcall,error,setfenv

module ('pl.data',utils._module)

local parse_select

local DataMT = {
    column_by_name = function(self,name)
        return seq.copy(query(self,name))
    end,
    copy_query = function(self,condn)
        condn = parse_select(condn,self)
        local res = seq.copy_tuples(query(self,condn))
        res.delim = self.delim
        return new(res,List.split(condn.fields,','))
    end,
    column_names = function(self)
        return self.fieldnames
    end,
}
DataMT.__index = DataMT

-- [guessing delimiter] We check for comma, tab and spaces in that order.
-- [issue] any other delimiters to be checked?
local function guess_delim (line)
    if count(line,',') > 0 then
        return ','
    elseif count(line,'\t') > 0 then
        return '\t'
    elseif count(line,' ') > 0 then
        return '%s+'
    else
        return ' '
    end
end

-- [file parameter] If it's a string, we try to open it as a filename. If nil, then
-- either stdin or stdout depending on the mode. Otherwise, check if this is
-- a file-like object (implements read or write, depending on the mode)
local function open_file (f,mode)
    local opened, err
    local reading = mode == 'r'
    if type(f) == 'string' then
        f,err = io.open(f,mode)
        if not f then return throw(err) end
        opened = true
    end
    if f and ((reading and not f.read) or (not reading and not f.write)) then
        return throw "not a file-like object"
    end
    return (f or (reading and io.stdin or io.stdout)),nil,opened
end
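-- To illustrate the two helpers above (a sketch only; these calls are not executed here):
--   guess_delim 'a,b,c'     --> ','
--   guess_delim 'a\tb\tc'   --> '\t'
--   guess_delim 'a  b  c'   --> '%s+'   (any run of whitespace)
--   guess_delim 'abc'       --> ' '     (fallback)
-- open_file accepts a filename, an already-open file-like object, or nil,
-- which means stdin when reading and stdout when writing.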
local function all_n ()
end

--- read a delimited file into a Lua table.
-- By default, attempts to treat the first line as a delimited list of fieldnames.
-- @param file a filename or a file-like object (default stdin)
-- @param cnfg options table: can override delim (a string pattern), fieldnames (a list),
-- specify no_convert (default is to convert), numfields (indices of columns known
-- to be numbers) and thousands_dot (thousands separator in Excel CSV is '.')
function read(file,cnfg)
    local list = seq.list
    local convert
    local data = {}
    if not cnfg then cnfg = {} end
    local f,err,opened = open_file(file,'r')
    if not f then return throw (err) end
    local thousands_dot = cnfg.thousands_dot
    local function try_tonumber(x)
        if thousands_dot then x = x:gsub('%.(...)','%1') end
        return tonumber(x)
    end
    local line = f:read()
    if not line then return throw "empty file" end
    -- first question: what is the delimiter?
    data.delim = cnfg.delim and cnfg.delim or guess_delim(line)
    local delim = data.delim
    local collect_end = cnfg.last_field_collect
    -- first line will usually be field names. Unless fieldnames are specified,
    -- we check if it contains purely numerical values for the case of reading
    -- plain data files.
    if not cnfg.fieldnames then
        local fields = List.split(line,delim)
        local nums = map(tonumber,fields)
        if #nums == #fields then -- the first line is all numbers: no header row
            convert = tonumber
            append(data,nums)
        else
            cnfg.fieldnames = fields
        end
        line = f:read()
    elseif type(cnfg.fieldnames) == 'string' then
        cnfg.fieldnames = List.split(cnfg.fieldnames,delim)
    end
    -- at this point, the column headers have been read in. If the first
    -- row consisted of numbers, it has already been added to the dataset.
    local numfields = cnfg.numfields
    if cnfg.fieldnames then
        data.fieldnames = cnfg.fieldnames
        -- [conversion] unless @no_convert, we need the numerical field indices
        -- of the first data row. Can also be specified by @numfields.
        if not cnfg.no_convert then
            if not numfields then
                numfields = List()
                local fields = split(line,data.delim)
                for i = 1,#fields do
                    if tonumber(fields[i]) then numfields:append(i) end
                end
            end
            if #numfields > 0 then -- there are numerical fields
                -- note that using dot as the thousands separator (@thousands_dot)
                -- requires a special conversion function!
                convert = thousands_dot and try_tonumber or tonumber
            end
        end
    end
    local N = #data.fieldnames
    -- keep going until finished
    while line do
        if not line:find ('^%s*$') then
            local fields = split(line,delim)
            if convert then
                for i in list(numfields) do
                    local val = convert(fields[i])
                    if val == nil then
                        return throw ("not a number: "..fields[i])
                    else
                        fields[i] = val
                    end
                end
            end
            -- [collecting end field] If @last_field_collect then we will collect
            -- all extra space-delimited fields into a single last field.
            if collect_end and #fields > N then
                local ends = List(fields):slice(N):join ' '
                tablex.icopy(fields,{ends},N) --*note* copy
            end
            append(data,fields)
        end
        line = f:read()
    end
    if opened then f:close() end
    if delim == '%s+' then data.delim = ' ' end
    return new(data)
end

local function write_row (data,f,row)
    f:write(List.join(row,data.delim),'\n')
end
DataMT.write_row = write_row

local function write (data,file)
    local f,err,opened = open_file(file,'w')
    if not f then return throw (err) end
    f:write(data.fieldnames:join(data.delim),'\n')
    for i = 1,#data do
        write_row(data,f,data[i])
    end
    if opened then f:close() end
end
DataMT.write = write

local function massage_fieldnames (fields)
    -- [fieldnames must be valid Lua identifiers] fix 0.8 was %A
    for i = 1,#fields do
        fields[i] = fields[i]:gsub('%W','_')
    end
end
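-- A read/write usage sketch (the file and column names are hypothetical):
--   local d = read 'results.csv'               -- first line becomes the fieldnames
--   local names = d:column_names()             -- the List of field names
--   local ages = d:column_by_name 'age'        -- all values of one column, as a table
--   d:write 'results-copy.csv'                 -- write back out, header line first
-- Conversion is controlled through the cnfg table, e.g.
--   read('results.csv',{no_convert=true})      -- keep every field as a string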
--- create a new dataset from a table of rows.
-- Can specify the fieldnames, else the table must have a field called
-- 'fieldnames', which is either a string of comma-separated names,
-- or a table of names.
-- @param data the table.
-- @param fieldnames optional fieldnames
-- @return the table.
function new (data,fieldnames)
    data.fieldnames = data.fieldnames or fieldnames
    if not data.delim and type(data.fieldnames) == 'string' then
        data.delim = guess_delim(data.fieldnames)
        data.fieldnames = split(data.fieldnames,data.delim)
    end
    data.fieldnames = List(data.fieldnames)
    massage_fieldnames(data.fieldnames)
    setmetatable(data,DataMT)
    -- a query with just the fieldname will return a sequence
    -- of values, which seq.copy turns into a table.
    return data
end

local sorted_query = [[
return function (t)
    local i = 0
    local v
    local ls = {}
    for i,v in ipairs(t) do
        if CONDITION then
            ls[#ls+1] = v
        end
    end
    table.sort(ls,function(v1,v2)
        return SORT_EXPR
    end)
    local n = #ls
    return function()
        i = i + 1
        v = ls[i]
        if i > n then return end
        return FIELDLIST
    end
end
]]

-- question: is this optimized case actually worth the extra code?
local simple_query = [[
return function (t)
    local n = #t
    local i = 0
    local v
    return function()
        repeat
            i = i + 1
            v = t[i]
        until i > n or CONDITION
        if i > n then return end
        return FIELDLIST
    end
end
]]

local function is_string (s)
    return type(s) == 'string'
end

local field_error

function fieldnames_as_string (data)
    return concat(data.fieldnames,',')
end

local function massage_fields(data,f)
    local idx = find(data.fieldnames,f)
    if idx then
        return 'v['..idx..']'
    else
        field_error = f..' not found in '..fieldnames_as_string(data)
        return f
    end
end

local function process_select (data,parms)
    --- preparing fields ----
    local res,ret
    field_error = nil
    if parms.fields:find '^%s*%*%s*' then
        parms.fields = fieldnames_as_string(data)
    end
    local fields = rstrip(parms.fields):gsub('[^,%w]','_') -- non-identifier chars
    local massage_fields = utils.bind1(massage_fields,data)
    ret = gsub(fields,patterns.IDEN,massage_fields)
    if field_error then return throw(field_error) end
    parms.proc_fields = ret
    parms.where = parms.where or 'true'
    if is_string(parms.where) then
        parms.where = gsub(parms.where,patterns.IDEN,massage_fields)
        field_error = nil
    end
    return true
end

parse_select = function(s,data)
    local endp
    local parms = {}
    local w1,w2 = s:find('where ')
    local s1,s2 = s:find('sort by ')
    if w1 then -- where clause!
        endp = (s1 or 0)-1
        parms.where = s:sub(w2+1,endp)
    end
    if s1 then -- sort by clause (must be last!)
        parms.sort_by = s:sub(s2+1)
    end
    endp = (w1 or s1 or 0)-1
    parms.fields = s:sub(1,endp)
    local status,err = process_select(data,parms)
    if not status then return throw(err)
    else return parms end
end
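-- To illustrate what parse_select produces (a sketch; assume a dataset d whose
-- fieldnames are {'x','y'}):
--   parse_select('x,y where x > 2 sort by y',d)
-- returns roughly
--   {fields='x,y ', where='v[1] > 2 ', sort_by='y', proc_fields='v[1],v[2]'}
-- i.e. field names are rewritten as v[INDEX] accesses into each row, ready to be
-- substituted into the FIELDLIST and CONDITION slots of the query templates above.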
--- create a query iterator from a select string.
-- Select string has this format:
-- FIELDLIST [ where LUA-CONDN [ sort by FIELD] ]
-- FIELDLIST is a comma-separated list of valid fields, or '*'.
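-- For example (with hypothetical column names x and y):
--   'x,y where x > 2 sort by y'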
-- The condition can also be a table, with fields 'fields' (comma-sep string or
-- table), 'sort_by' (string) and 'where' (Lua expression string or function)
-- @param data table produced by read
-- @param condn select string or table
-- @param context a list of tables to be searched when resolving functions
-- @param return_row if true, wrap the results in a row table
-- @return an iterator over the specified fields
function query(data,condn,context,return_row)
    local err
    if is_string(condn) then
        condn,err = parse_select(condn,data)
        if not condn then return throw(err) end
    elseif type(condn) == 'table' then
        if type(condn.fields) == 'table' then
            condn.fields = concat(condn.fields,',')
        end
        if not condn.proc_fields then
            local status,err = process_select(data,condn)
            if not status then return throw(err) end
        end
    else
        return throw "condition must be a string or a table"
    end
    local query
    if condn.sort_by then -- use sorted_query
        query = sorted_query
    else
        query = simple_query
    end
    local fields = condn.proc_fields or condn.fields
    if return_row then
        fields = '{'..fields..'}'
    end
    query = query:gsub('FIELDLIST',fields)
    if is_string(condn.where) then
        query = query:gsub('CONDITION',condn.where)
        condn.where = nil
    else
        query = query:gsub('CONDITION','_condn(v)')
        condn.where = function_arg(condn.where)
    end
    if condn.sort_by then
        local expr,sort_var,sort_dir
        local sort_by = condn.sort_by
        local i1,i2 = sort_by:find('%s+')
        if i1 then
            sort_var,sort_dir = sort_by:sub(1,i1-1),sort_by:sub(i2+1)
        else
            sort_var = sort_by
            sort_dir = 'asc'
        end
        sort_var = massage_fields(data,sort_var)
        if field_error then return throw(field_error) end
        if sort_dir == 'asc' then
            sort_dir = '<'
        else
            sort_dir = '>'
        end
        expr = ('%s %s %s'):format(sort_var:gsub('v','v1'),sort_dir,sort_var:gsub('v','v2'))
        query = query:gsub('SORT_EXPR',expr)
    end
    if condn.where then
        query = 'return function(_condn) '..query..' end'
    end
    if _DEBUG then print(query) end
    local fn,err = loadstring(query,'tmp')
    if not fn then return throw(err) end
    fn = fn() -- get the function
    if condn.where then
        fn = fn(condn.where)
    end
    local qfun = fn(data)
    if context then
        -- [specifying context for condition] @context is a list of tables which are
        -- 'injected' into the condition's custom context
        append(context,_G)
        local lookup = {}
        setfenv(qfun,lookup)
        setmetatable(lookup,{
            __index = function(tbl,key)
                -- _G.print(tbl,key)
                for k,t in ipairs(context) do
                    if t[key] then return t[key] end
                end
            end
        })
    end
    return qfun
end

DataMT.select = query
DataMT.select_row = function(data,condn,context)
    return query(data,condn,context,true)
end

--- Filter input using a query.
-- @param Q a query string
-- @param file a filename or a file-like object (default stdin)
-- @param dont_fail true if you want an error to be returned, rather than quitting
function filter (Q,file,dont_fail)
    local d = read(file)
    local iter,err = d:select(Q)
    local delim = d.delim
    if not iter then
        err = 'error: '..err
        if dont_fail then
            return nil,err
        else
            utils.quit(1,err)
        end
    end
    while true do
        local res = {iter()}
        if #res == 0 then break end
        print(concat(res,delim))
    end
end
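-- A usage sketch for the query interface (file and column names are hypothetical):
--   local d = read 'xyz.txt'                      -- columns x,y,z
--   for x,y in d:select 'x,y where x > 2 sort by y' do
--       print(x,y)
--   end
--   for row in d:select_row '* where z > 0' do    -- each result comes back as one table
--       print(row[1],row[2],row[3])
--   end
--   filter('x,z where z < 0','xyz.txt')           -- print matching rows, delimiter-joined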