--- Reading and querying simple tabular data. -- -- data.read 'test.txt' -- ==> {{10,20},{2,5},{40,50},fieldnames={'x','y'},delim=','} -- -- Provides a way of creating basic SQL-like queries. -- -- require 'pl' -- local d = data.read('xyz.txt') -- local q = d:select('x,y,z where x > 3 and z < 2 sort by y') -- for x,y,z in q do -- print(x,y,z) -- end -- -- See @{06-data.md.Reading_Columnar_Data|the Guide} -- -- Dependencies: `pl.utils`, `pl.array2d` (fallback methods) -- @module pl.data local utils = require 'pl.utils' local _DEBUG = rawget(_G,'_DEBUG') local patterns,function_arg,usplit = utils.patterns,utils.function_arg,utils.split local append,concat = table.insert,table.concat local gsub = string.gsub local io = io local _G,print,type,tonumber,ipairs,setmetatable,pcall,error,setfenv = _G,print,type,tonumber,ipairs,setmetatable,pcall,error,setfenv local data = {} local parse_select local function count(s,chr) chr = utils.escape(chr) local _,cnt = s:gsub(chr,' ') return cnt end local function rstrip(s) return s:gsub('%s+$','') end local function make_list(l) return setmetatable(l,utils.stdmt.List) end local function split(s,delim) return make_list(usplit(s,delim)) end local function map(fun,t) local res = {} for i = 1,#t do append(res,fun(t[i])) end return res end local function find(t,v) for i = 1,#t do if v == t[i] then return i end end end local DataMT = { column_by_name = function(self,name) if type(name) == 'number' then name = '$'..name end local arr = {} for res in data.query(self,name) do append(arr,res) end return make_list(arr) end, copy_select = function(self,condn) condn = parse_select(condn,self) local iter = data.query(self,condn) local res = {} local row = make_list{iter()} while #row > 0 do append(res,row) row = make_list{iter()} end res.delim = self.delim return data.new(res,split(condn.fields,',')) end, column_names = function(self) return self.fieldnames end, } local array2d DataMT.__index = function(self,name) local f = DataMT[name] if f then return f end if not array2d then array2d = require 'pl.array2d' end return array2d[name] end --- return a particular column as a list of values (method). -- @param name either name of column, or numerical index. -- @function Data.column_by_name --- return a query iterator on this data (method). -- @param condn the query expression -- @function Data.select -- @see data.query --- return a row iterator on this data (method). -- @param condn the query expression -- @function Data.select_row --- return a new data object based on this query (method). -- @param condn the query expression -- @function Data.copy_select --- return the field names of this data object (method). -- @function Data.column_names --- write out a row (method). -- @param f file-like object -- @function Data.write_row --- write data out to file (method). -- @param f file-like object -- @function Data.write -- [guessing delimiter] We check for comma, tab and spaces in that order. -- [issue] any other delimiters to be checked? local delims = {',','\t',' ',';'} local function guess_delim (line) if line=='' then return ' ' end for _,delim in ipairs(delims) do if count(line,delim) > 0 then return delim == ' ' and '%s+' or delim end end return ' ' end -- [file parameter] If it's a string, we try open as a filename. If nil, then -- either stdin or stdout depending on the mode. Otherwise, check if this is -- a file-like object (implements read or write depending) local function open_file (f,mode) local opened, err local reading = mode == 'r' if type(f) == 'string' then if f == 'stdin' then f = io.stdin elseif f == 'stdout' then f = io.stdout else f,err = io.open(f,mode) if not f then return nil,err end opened = true end end if f and ((reading and not f.read) or (not reading and not f.write)) then return nil, "not a file-like object" end return f,nil,opened end local function all_n () end --- read a delimited file in a Lua table. -- By default, attempts to treat first line as separated list of fieldnames. -- @param file a filename or a file-like object (default stdin) -- @param cnfg options table: can override delim (a string pattern), fieldnames (a list), -- specify no_convert (default is to convert), numfields (indices of columns known -- to be numbers) and thousands_dot (thousands separator in Excel CSV is '.') function data.read(file,cnfg) local convert,err,opened local D = {} if not cnfg then cnfg = {} end local f,err,opened = open_file(file,'r') if not f then return nil, err end local thousands_dot = cnfg.thousands_dot local function try_tonumber(x) if thousands_dot then x = x:gsub('%.(...)','%1') end return tonumber(x) end local line = f:read() if not line then return nil, "empty file" end -- first question: what is the delimiter? D.delim = cnfg.delim and cnfg.delim or guess_delim(line) local delim = D.delim local collect_end = cnfg.last_field_collect local numfields = cnfg.numfields -- some space-delimited data starts with a space. This should not be a column, -- although it certainly would be for comma-separated, etc. local strip if delim == '%s+' and line:find(delim) == 1 then strip = function(s) return s:gsub('^%s+','') end line = strip(line) end -- first line will usually be field names. Unless fieldnames are specified, -- we check if it contains purely numerical values for the case of reading -- plain data files. if not cnfg.fieldnames then local fields = split(line,delim) local nums = map(tonumber,fields) if #nums == #fields then convert = tonumber append(D,nums) numfields = {} for i = 1,#nums do numfields[i] = i end else cnfg.fieldnames = fields end line = f:read() if strip then line = strip(line) end elseif type(cnfg.fieldnames) == 'string' then cnfg.fieldnames = split(cnfg.fieldnames,delim) end -- at this point, the column headers have been read in. If the first -- row consisted of numbers, it has already been added to the dataset. if cnfg.fieldnames then D.fieldnames = cnfg.fieldnames -- [conversion] unless @no_convert, we need the numerical field indices -- of the first data row. Can also be specified by @numfields. if not cnfg.no_convert then if not numfields then numfields = {} local fields = split(line,D.delim) for i = 1,#fields do if tonumber(fields[i]) then append(numfields,i) end end end if #numfields > 0 then -- there are numerical fields -- note that using dot as the thousands separator (@thousands_dot) -- requires a special conversion function! convert = thousands_dot and try_tonumber or tonumber end end end -- keep going until finished while line do if not line:find ('^%s*$') then if strip then line = strip(line) end local fields = split(line,delim) if convert then for k = 1,#numfields do local i = numfields[k] local val = convert(fields[i]) if val == nil then return nil, "not a number: "..fields[i] else fields[i] = val end end end -- [collecting end field] If @last_field_collect then we will collect -- all extra space-delimited fields into a single last field. if collect_end and #fields > #D.fieldnames then local ends,N = {},#D.fieldnames for i = N+1,#fields do append(ends,fields[i]) end ends = concat(ends,' ') local cfields = {} for i = 1,N do cfields[i] = fields[i] end cfields[N] = cfields[N]..' '..ends fields = cfields end append(D,fields) end line = f:read() end if opened then f:close() end if delim == '%s+' then D.delim = ' ' end if not D.fieldnames then D.fieldnames = {} end return data.new(D) end local function write_row (data,f,row,delim) f:write(concat(row,delim),'\n') end function DataMT:write_row(f,row) write_row(self,f,row,self.delim) end --- write 2D data to a file. -- Does not assume that the data has actually been -- generated with `new` or `read`. -- @param data 2D array -- @param file filename or file-like object -- @param fieldnames list of fields (optional) -- @param delim delimiter (default tab) function data.write (data,file,fieldnames,delim) local f,err,opened = open_file(file,'w') if not f then return nil, err end if fieldnames and #fieldnames > 0 then f:write(concat(data.fieldnames,delim),'\n') end delim = delim or '\t' for i = 1,#data do write_row(data,f,data[i],delim) end if opened then f:close() end end function DataMT:write(file) data.write(self,file,self.fieldnames,self.delim) end local function massage_fieldnames (fields) -- fieldnames must be valid Lua identifiers; ignore any surrounding padding for i = 1,#fields do fields[i] = rstrip(fields[i]):gsub('^%s*',''):gsub('%W','_') end end --- create a new dataset from a table of rows. -- Can specify the fieldnames, else the table must have a field called -- 'fieldnames', which is either a string of delimiter-separated names, -- or a table of names.
-- If the table does not have a field called 'delim', then an attempt will be -- made to guess it from the fieldnames string, defaults otherwise to tab. -- @param d the table. -- @param fieldnames optional fieldnames -- @return the table. function data.new (d,fieldnames) d.fieldnames = d.fieldnames or fieldnames or '' if not d.delim and type(d.fieldnames) == 'string' then d.delim = guess_delim(d.fieldnames) d.fieldnames = split(d.fieldnames,d.delim) end d.fieldnames = make_list(d.fieldnames) massage_fieldnames(d.fieldnames) setmetatable(d,DataMT) -- a query with just the fieldname will return a sequence -- of values, which seq.copy turns into a table. return d end local sorted_query = [[ return function (t) local i = 0 local v local ls = {} for i,v in ipairs(t) do if CONDITION then ls[#ls+1] = v end end table.sort(ls,function(v1,v2) return SORT_EXPR end) local n = #ls return function() i = i + 1 v = ls[i] if i > n then return end return FIELDLIST end end ]] -- question: is this optimized case actually worth the extra code? local simple_query = [[ return function (t) local n = #t local i = 0 local v return function() repeat i = i + 1 v = t[i] until i > n or CONDITION if i > n then return end return FIELDLIST end end ]] local function is_string (s) return type(s) == 'string' end local field_error local function fieldnames_as_string (data) return concat(data.fieldnames,',') end local function massage_fields(data,f) local idx if f:find '^%d+$' then idx = tonumber(f) else idx = find(data.fieldnames,f) end if idx then return 'v['..idx..']' else field_error = f..' not found in '..fieldnames_as_string(data) return f end end local function process_select (data,parms) --- preparing fields ---- local res,ret field_error = nil local fields = parms.fields local numfields = fields:find '%$' or #data.fieldnames == 0 if fields:find '^%s*%*%s*' then if not numfields then fields = fieldnames_as_string(data) else local ncol = #data[1] fields = {} for i = 1,ncol do append(fields,'$'..i) end fields = concat(fields,',') end end local idpat = patterns.IDEN if numfields then idpat = '%$(%d+)' else -- massage field names to replace non-identifier chars fields = rstrip(fields):gsub('[^,%w]','_') end local massage_fields = utils.bind1(massage_fields,data) ret = gsub(fields,idpat,massage_fields) if field_error then return nil,field_error end parms.fields = fields parms.proc_fields = ret parms.where = parms.where or 'true' if is_string(parms.where) then parms.where = gsub(parms.where,idpat,massage_fields) field_error = nil end return true end parse_select = function(s,data) local endp local parms = {} local w1,w2 = s:find('where ') local s1,s2 = s:find('sort by ') if w1 then -- where clause! endp = (s1 or 0)-1 parms.where = s:sub(w2+1,endp) end if s1 then -- sort by clause (must be last!) parms.sort_by = s:sub(s2+1) end endp = (w1 or s1 or 0)-1 parms.fields = s:sub(1,endp) local status,err = process_select(data,parms) if not status then return nil,err else return parms end end --- create a query iterator from a select string. -- Select string has this format:
-- FIELDLIST [ where LUA-CONDN [ sort by FIELD] ]
-- FIELDLIST is a comma-separated list of valid fields, or '*'.

-- The condition can also be a table, with fields 'fields' (comma-sep string or -- table), 'sort_by' (string) and 'where' (Lua expression string or function) -- @param data table produced by read -- @param condn select string or table -- @param context a list of tables to be searched when resolving functions -- @param return_row if true, wrap the results in a row table -- @return an iterator over the specified fields, or nil -- @return an error message function data.query(data,condn,context,return_row) local err if is_string(condn) then condn,err = parse_select(condn,data) if not condn then return nil,err end elseif type(condn) == 'table' then if type(condn.fields) == 'table' then condn.fields = concat(condn.fields,',') end if not condn.proc_fields then local status,err = process_select(data,condn) if not status then return nil,err end end else return nil, "condition must be a string or a table" end local query, k if condn.sort_by then -- use sorted_query query = sorted_query else query = simple_query end local fields = condn.proc_fields or condn.fields if return_row then fields = '{'..fields..'}' end query,k = query:gsub('FIELDLIST',fields) if is_string(condn.where) then query = query:gsub('CONDITION',condn.where) condn.where = nil else query = query:gsub('CONDITION','_condn(v)') condn.where = function_arg(0,condn.where,'condition.where must be callable') end if condn.sort_by then local expr,sort_var,sort_dir local sort_by = condn.sort_by local i1,i2 = sort_by:find('%s+') if i1 then sort_var,sort_dir = sort_by:sub(1,i1-1),sort_by:sub(i2+1) else sort_var = sort_by sort_dir = 'asc' end if sort_var:match '^%$' then sort_var = sort_var:sub(2) end sort_var = massage_fields(data,sort_var) if field_error then return nil,field_error end if sort_dir == 'asc' then sort_dir = '<' else sort_dir = '>' end expr = ('%s %s %s'):format(sort_var:gsub('v','v1'),sort_dir,sort_var:gsub('v','v2')) query = query:gsub('SORT_EXPR',expr) end if condn.where then query = 'return function(_condn) '..query..' end' end if _DEBUG then print(query) end local fn,err = loadstring(query,'tmp') if not fn then return nil,err end fn = fn() -- get the function if condn.where then fn = fn(condn.where) end local qfun = fn(data) if context then -- [specifying context for condition] @context is a list of tables which are -- 'injected'into the condition's custom context append(context,_G) local lookup = {} setfenv(qfun,lookup) setmetatable(lookup,{ __index = function(tbl,key) -- _G.print(tbl,key) for k,t in ipairs(context) do if t[key] then return t[key] end end end }) end return qfun end DataMT.select = data.query DataMT.select_row = function(d,condn,context) return data.query(d,condn,context,true) end --- Filter input using a query. -- @param Q a query string -- @param infile filename or file-like object -- @param outfile filename or file-like object -- @param dont_fail true if you want to return an error, not just fail function data.filter (Q,infile,outfile,dont_fail) local err local d = data.read(infile or 'stdin') local out = open_file(outfile or 'stdout') local iter,err = d:select(Q) local delim = d.delim if not iter then err = 'error: '..err if dont_fail then return nil,err else utils.quit(1,err) end end while true do local res = {iter()} if #res == 0 then break end out:write(concat(res,delim),'\n') end end return data