--_DEBUG=true data = require 'pl.data' List = require 'pl.List' array = require 'pl.array2d' seq = require 'pl.seq' utils = require 'pl.utils' stringio = require 'pl.stringio' open = stringio. open asserteq = require 'pl.test' . asserteq T = require 'pl.test'. tuple -- tab-separated data, explicit column names t1f = open [[ EventID Magnitude LocationX LocationY LocationZ LocationError EventDate DataFile 981124001 2.0 18988.4 10047.1 4149.7 33.8 24/11/1998 11:18:05 981124DF.AAB 981125001 0.8 19104.0 9970.4 5088.7 3.0 25/11/1998 05:44:54 981125DF.AAB 981127003 0.5 19012.5 9946.9 3831.2 46.0 27/11/1998 17:15:17 981127DF.AAD 981127005 0.6 18676.4 10606.2 3761.9 4.4 27/11/1998 17:46:36 981127DF.AAF 981127006 0.2 19109.9 9716.5 3612.0 11.8 27/11/1998 19:29:51 981127DF.AAG ]] t1 = data.read (t1f) -- column_by_name returns a List asserteq(t1:column_by_name 'Magnitude',List{2,0.8,0.5,0.6,0.2}) -- can use array.column as well asserteq(array.column(t1,2),{2,0.8,0.5,0.6,0.2}) -- only numerical columns (deduced from first data row) are converted by default -- can look up indices in the list fieldnames. EDI = t1.fieldnames:index 'EventDate' assert(type(t1[1][EDI]) == 'string') -- select method returns a sequence, in this case single-valued. -- (Note that seq.copy returns a List) asserteq(seq(t1:select 'LocationX where Magnitude > 0.5'):copy(),List{18988.4,19104,18676.4}) --[[ --a common select usage pattern: for event,mag in t1:select 'EventID,Magnitude sort by Magnitude desc' do print(event,mag) end --]] -- space-separated, but with last field containing spaces. t2f = open [[ USER PID %MEM %CPU COMMAND sdonovan 2333 0.3 0.1 background --n=2 root 2332 0.4 0.2 fred --start=yes root 2338 0.2 0.1 backyard-process ]] t2,err = data.read(t2f,{last_field_collect=true}) if not t2 then return print (err) end -- the last_field_collect option is useful with space-delimited data where the last -- field may contain spaces. Otherwise, a record count mismatch should be an error! lt2 = List(t2[2]) asserteq(lt2:join ',','root,2332,0.4,0.2,fred --start=yes') -- fieldnames are converted into valid identifiers by substituting _ -- (we do this to make select queries parseable by Lua) asserteq(t2.fieldnames,List{'USER','PID','_MEM','_CPU','COMMAND'}) -- select queries are NOT SQL so remember to use == ! (and no 'between' operator, sorry) --s,err = t2:select('_MEM where USER="root"') --assert(err == [[[string "tmp"]:9: unexpected symbol near '=']]) s = t2:select('_MEM where USER=="root"') assert(s() == 0.4) assert(s() == 0.2) assert(s() == nil) -- CSV, Excel style t3f = open [[ Department Name,Employee ID,Project,Hours Booked sales,1231,overhead,4 sales,1255,overhead,3 engineering,1501,development,5 engineering,1501,maintenance,3 engineering,1433,maintenance,10 ]] t3 = data.read(t3f) -- a common operation is to select using a given list of columns, and each row -- on some explicit condition. The select() method can take a table with these -- parameters keepcols = {'Employee_ID','Hours_Booked'} q = t3:select { fields = keepcols, where = function(row) return row[1]=='engineering' end } asserteq(seq.copy2(q),{{1501,5},{1501,3},{1433,10}}) -- another pattern is doing a select to restrict rows & columns, process some -- fields and write out the modified rows. utils.import 'pl.func' outf = stringio.create() names = {[1501]='don',[1433]='dilbert'} t3:write_row (outf,{'Employee','Hours_Booked'}) q = t3:select_row {fields=keepcols,where=Eq(_1[1],'engineering')} for row in q do row[1] = names[row[1]] t3:write_row(outf,row) end asserteq(outf:value(), [[ Employee,Hours_Booked don,5 don,3 dilbert,10 ]]) -- data may not always have column headers. When creating a data object -- from a two-dimensional array, must specify the fieldnames, as a list or a string. -- The delimiter is deduced from the fieldname string, so a string just containing -- the delimiter will set it, and the fieldnames will be empty. local dat = List() local row = List.range(1,10) for i = 1,10 do dat:append(row:map('*',i)) end dat = data.new(dat,',') local out = stringio.create() dat:write(out,',') asserteq(out:value(), [[ 1,2,3,4,5,6,7,8,9,10 2,4,6,8,10,12,14,16,18,20 3,6,9,12,15,18,21,24,27,30 4,8,12,16,20,24,28,32,36,40 5,10,15,20,25,30,35,40,45,50 6,12,18,24,30,36,42,48,54,60 7,14,21,28,35,42,49,56,63,70 8,16,24,32,40,48,56,64,72,80 9,18,27,36,45,54,63,72,81,90 10,20,30,40,50,60,70,80,90,100 ]]) -- you can always use numerical field indices, AWK-style; -- note how the copy_select method gives you a data object instead of an -- iterator over the fields local res = dat:copy_select '$1,$3 where $1 > 5' local L = List asserteq(L(res),L{ L{6, 18}, L{7,21}, L{8,24}, L{9,27}, L{10,30}, }) -- the column_by_name method may take a fieldname or an index asserteq(dat:column_by_name(2), L{2,4,6,8,10,12,14,16,18,20}) -- the field list may contain expressions or even constants local q = dat:select '$3,2*$4 where $1 == 8' asserteq(T(q()),T(24,64)) dat = data.read(open [[ 1.0 0.1 0.2 1.3 ]]) -- if a method cannot be found, then we look up in array2d -- array2d.flatten(t) makes a 1D list out of a 2D array, -- and then List.minmax() gets the extrema. asserteq(T(dat:flatten():minmax()),T(0.1,1.3))