diff --git a/Project.toml b/Project.toml index 68ecaaa..ee12dfb 100644 --- a/Project.toml +++ b/Project.toml @@ -5,12 +5,10 @@ version = "1.0.1-DEV" [deps] DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" -ExcelReaders = "c04bee98-12a5-510c-87df-2a230cb6e075" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" IterableTables = "1c8ee90f-4401-5389-894e-7a04a3dc0f4d" IteratorInterfaceExtensions = "82899510-4779-5014-852e-03e436cf321d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" -PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" TableShowUtils = "5e66a065-1f0a-5976-b372-e0b8c017ca10" TableTraits = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" TableTraitsUtils = "382cd787-c1b6-5bf2-a167-d5b971a19bda" @@ -18,15 +16,13 @@ XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0" [compat] DataValues = "0.4.11" -ExcelReaders = "0.11" FileIO = "1" IterableTables = "0.8.3, 0.9, 0.10, 0.11, 1" IteratorInterfaceExtensions = "0.1.1, 1" -PyCall = "1.90" TableShowUtils = "0.2" TableTraits = "0.3.1, 0.4, 1" TableTraitsUtils = "0.3, 0.4, 1" -XLSX = "0.4.1, 0.5, 0.6, 0.7, 0.8, 0.9" +XLSX = "0.10, 0.11" julia = "1" [extras] diff --git a/README.md b/README.md index f175400..6f037d6 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,18 @@ ## Overview -This package provides load support for Excel files under the +This package provides support for Excel files under the [FileIO.jl](https://github.com/JuliaIO/FileIO.jl) package. +It provides functionality to read simple tabular data from +an Excel (.xlsx) file and to save simple tabular data to an +Excel file. + +For more extensive functionality when reading and writing Excel files, +consider using [XLSX.jl](https://felipenoris.github.io/XLSX.jl/stable/). +Under the hood, `ExcelFiles.jl` uses the `XLSX.jl` functions `readtable` +and `writetable`. + ## Installation Use ``Pkg.add("ExcelFiles")`` in Julia to install ExcelFiles and its dependencies. 
@@ -18,17 +27,17 @@ Use ``Pkg.add("ExcelFiles")`` in Julia to install ExcelFiles and its dependencie ### Load an Excel file -To read a Excel file into a ``DataFrame``, use the following julia code: +To read an Excel file into a `DataFrame`, use the following julia code: -````julia +```julia using ExcelFiles, DataFrames df = DataFrame(load("data.xlsx", "Sheet1")) -```` +``` -The call to ``load`` returns a ``struct`` that is an [IterableTable.jl](https://github.com/queryverse/IterableTables.jl), so it can be passed to any function that can handle iterable tables, i.e. all the sinks in [IterableTable.jl](https://github.com/queryverse/IterableTables.jl). Here are some examples of materializing an Excel file into data structures that are not a ``DataFrame``: +The call to `load` returns an object that is an [IterableTable.jl](https://github.com/queryverse/IterableTables.jl), so it can be passed to any function that can handle iterable tables, i.e. all the sinks in [IterableTable.jl](https://github.com/queryverse/IterableTables.jl). Here are some examples of materializing an Excel file into data structures that are not a `DataFrame`: -````julia +```julia using ExcelFiles, DataTables, IndexedTables, TimeSeries, Temporal, Gadfly # Load into a DataTable @@ -45,46 +54,83 @@ ts = TS(load("data.xlsx", "Sheet1")) # Plot directly with Gadfly plot(load("data.xlsx", "Sheet1"), x=:a, y=:b, Geom.line) -```` +``` + +The `load` function takes a number of arguments and keywords: + +```julia + FileIO.load( + source::String, + [sheet::String, + [columns::String]]; + [first_row::Int], + [column_labels::Vector{String}], + [header::Bool], + [normalizenames::Bool] + ) +``` -The ``load`` function also takes a number of parameters: - -````julia -function load(f::FileIO.File{FileIO.format"Excel"}, range; keywords...) -```` #### Arguments: -* ``range``: either the name of the sheet in the Excel file to read, or a full Excel range specification (i.e. "Sheetname!A1:B2"). 
-* The ``keywords`` arguments are the same as in [ExcelReaders.jl](https://github.com/queryverse/ExcelReaders.jl) (which is used under the hood to read Excel files). When ``range`` is a sheet name, the keyword arguments for the ``readxlsheet`` function from ExcelReaders.jl apply, if ``range`` is a range specification, the keyword arguments for the ``readxl`` function apply. +* `source`: The name of the file to be loaded. +* `sheet`: Specifies the sheet name to be loaded. If `sheet` is not given, the first Excel sheet in the file will be used. +* `columns`: Determines which columns to read. For example, "B:D" will select columns B, C and D. If columns is not given, the algorithm will find the first sequence of consecutive non-empty cells. A valid sheet **must** be specified when specifying columns. + +#### Keywords: + +* `first_row`: Indicates the first row of the data table to be read. For example, `first_row=5` will look for a table starting at sheet row 5. If first_row is not given, the algorithm will look for the first non-empty row in the sheet. +* `header`: Indicates if the first row is a header. If `header=true` and `column_labels` is not specified, the column labels for the table will be read from the first row of the table. If `header=false` and `column_labels` is not specified, the algorithm will generate column labels. The default value is `header=true`. +* `column_labels`: Specifies column names for the header of the table. If `column_labels` are given and `header=true`, the headers given by `column_labels` will be used, and the first row of the table (containing headers) will be ignored. +* `normalizenames`: Set to `true` to normalize column names to valid Julia identifiers. Default=`false` ### Save an Excel file The following code saves any iterable table as an excel file: -````julia + +```julia using ExcelFiles save("output.xlsx", it) -```` -This will work as long as it is any of the types supported as sources in IterableTables.jl. 
+``` +This will work as long as `it` is any of the types supported as sources in IterableTables.jl (such as a `DataFrame`). + +The `save` function takes a number of arguments and keywords: + +```julia + FileIO.save( + source::String; + [sheetname::String], + [overwrite::Bool] + ) +``` + +#### Arguments: + +* `source`: The name of the file to be created on save. + +#### Keywords: + +* `sheetname`: Specify the sheetname to be used in the created file. By default, the sheetname will be `Sheet1`. +* `overwrite`: Set `overwrite=true` to overwrite any existing file of the same name. Default = `false`. ### Using the pipe syntax -``load`` also support the pipe syntax. For example, to load an Excel file into a ``DataFrame``, one can use the following code: +The `load` and `save` functions also support the pipe syntax. For example, to load an Excel file into a `DataFrame`, one can use the following code: -````julia +```julia using ExcelFiles, DataFrame df = load("data.xlsx", "Sheet1") |> DataFrame -```` +``` To save an iterable table, one can use the following form: -````julia +```julia using ExcelFiles, DataFrame df = # Aquire a DataFrame somehow df |> save("output.xlsx") -```` +``` -The pipe syntax is especially useful when combining it with [Query.jl](https://github.com/queryverse/Query.jl) queries, for example one can easily load an Excel file, pipe it into a query, then pipe it to the ``save`` function to store the results in a new file. +The pipe syntax is especially useful when combining it with [Query.jl](https://github.com/queryverse/Query.jl) queries, for example one can easily load an Excel file, pipe it into a query, then pipe it to the `save` function to store the results in a new file. 
diff --git a/data/TestData.xlsx b/data/TestData.xlsx new file mode 100644 index 0000000..d188f4e Binary files /dev/null and b/data/TestData.xlsx differ diff --git a/docs/src/index.md b/docs/src/index.md index e10b99d..1a79d63 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1 +1,124 @@ # Introduction + +This package provides support for Excel files under the +[FileIO.jl](https://github.com/JuliaIO/FileIO.jl) package. + +It provides functionality to read simple tabular data from +an Excel (.xlsx) file and to save simple tabular data to an +Excel file. + +For more extensive functionality when reading and writing Excel files, +consider using [XLSX.jl](https://felipenoris.github.io/XLSX.jl/stable/). +Under the hood, `ExcelFiles.jl` uses the `XLSX.jl` functions `readtable` +and `writetable`. + +# Usage + +## Load an Excel file + +To read an Excel file into a `DataFrame`, use the following julia code: + +```julia +using ExcelFiles, DataFrames + +df = DataFrame(load("data.xlsx", "Sheet1")) +``` + +The call to `load` returns an object that is an [IterableTable.jl](https://github.com/queryverse/IterableTables.jl), so it can be passed to any function that can handle iterable tables, i.e. all the sinks in [IterableTable.jl](https://github.com/queryverse/IterableTables.jl). 
Here are some examples of materializing an Excel file into data structures that are not a `DataFrame`: + +```julia +using ExcelFiles, DataTables, IndexedTables, TimeSeries, Temporal, Gadfly + +# Load into a DataTable +dt = DataTable(load("data.xlsx", "Sheet1")) + +# Load into an IndexedTable +it = IndexedTable(load("data.xlsx", "Sheet1")) + +# Load into a TimeArray +ta = TimeArray(load("data.xlsx", "Sheet1")) + +# Load into a TS +ts = TS(load("data.xlsx", "Sheet1")) + +# Plot directly with Gadfly +plot(load("data.xlsx", "Sheet1"), x=:a, y=:b, Geom.line) +``` + +The `load` function takes a number of arguments and keywords: + +```julia + FileIO.load( + source::String, + [sheet::String, + [columns::String]]; + [first_row::Int], + [column_labels::Vector{String}], + [header::Bool], + [normalizenames::Bool] + ) +``` + +### Arguments: + +* `source`: The name of the file to be loaded. +* `sheet`: Specifies the sheet name to be loaded. If `sheet` is not given, the first Excel sheet in the file will be used. +* `columns`: Determines which columns to read. For example, "B:D" will select columns B, C and D. If columns is not given, the algorithm will find the first sequence of consecutive non-empty cells. A valid sheet **must** be specified when specifying columns. + +### Keywords: + +* `first_row`: Indicates the first row of the data table to be read. For example, `first_row=5` will look for a table starting at sheet row 5. If first_row is not given, the algorithm will look for the first non-empty row in the sheet. +* `header`: Indicates if the first row is a header. If `header=true` and `column_labels` is not specified, the column labels for the table will be read from the first row of the table. If `header=false` and `column_labels` is not specified, the algorithm will generate column labels. The default value is `header=true`. +* `column_labels`: Specifies column names for the header of the table. 
If `column_labels` are given and `header=true`, the headers given by `column_labels` will be used, and the first row of the table (containing headers) will be ignored. +* `normalizenames`: Set to `true` to normalize column names to valid Julia identifiers. Default=`false`. + +## Save an Excel file + +The following code saves any iterable table as an Excel file: +```julia +using ExcelFiles + +save("output.xlsx", it) +``` +This will work as long as `it` is any of the types supported as sources in IterableTables.jl (such as a `DataFrame`). + +The `save` function takes a number of arguments and keywords: + +```julia + FileIO.save( + source::String; + [sheetname::String], + [overwrite::Bool] + ) +``` + +### Arguments: + +* `source`: The name of the file to be created on save. + +### Keywords: + +* `sheetname`: Specify the sheetname to be used in the created file. By default, the sheetname will be `Sheet1`. +* `overwrite`: Set `overwrite=true` to overwrite any existing file of the same name. Default = `false`. + +## Using the pipe syntax + +The `load` and `save` functions also support the pipe syntax. For example, to load an Excel file into a `DataFrame`, one can use the following code: + +```julia +using ExcelFiles, DataFrames + +df = load("data.xlsx", "Sheet1") |> DataFrame +``` + +To save an iterable table, one can use the following form: + +```julia +using ExcelFiles, DataFrames + +df = # Acquire a DataFrame somehow + +df |> save("output.xlsx") +``` + +The pipe syntax is especially useful when combining it with [Query.jl](https://github.com/queryverse/Query.jl) queries, for example one can easily load an Excel file, pipe it into a query, then pipe it to the `save` function to store the results in a new file. 
diff --git a/src/ExcelFiles.jl b/src/ExcelFiles.jl index 9b7eb6a..bce2a06 100644 --- a/src/ExcelFiles.jl +++ b/src/ExcelFiles.jl @@ -1,7 +1,7 @@ module ExcelFiles -using ExcelReaders, XLSX, IteratorInterfaceExtensions, TableTraits, DataValues +using XLSX, IteratorInterfaceExtensions, TableTraits, DataValues using TableTraitsUtils, FileIO, TableShowUtils, Dates, Printf import IterableTables @@ -9,7 +9,8 @@ export load, save, File, @format_str struct ExcelFile filename::String - range::String + sheet::Union{Nothing,String} + columns::Union{Nothing,String} keywords end @@ -29,100 +30,43 @@ end Base.Multimedia.showable(::MIME"application/vnd.dataresource+json", source::ExcelFile) = true -function fileio_load(f::FileIO.File{FileIO.format"Excel"}, range; keywords...) - return ExcelFile(f.filename, range, keywords) +function fileio_load(f::FileIO.File{FileIO.format"Excel", String}, sheet, columns; kw...) + return ExcelFile(f.filename, sheet, columns, kw) +end +function fileio_load(f::FileIO.File{FileIO.format"Excel", String}, sheet; kw...) + return ExcelFile(f.filename, sheet, nothing, kw) +end +function fileio_load(f::FileIO.File{FileIO.format"Excel", String}; kw...) + return ExcelFile(f.filename, nothing, nothing, kw) end -function fileio_save(f::FileIO.File{FileIO.format"Excel"}, data; sheetname::AbstractString="") +function fileio_save(f::FileIO.File{FileIO.format"Excel"}, data; kw...) cols, colnames = TableTraitsUtils.create_columns_from_iterabletable(data, na_representation=:missing) - return XLSX.writetable(f.filename, cols, colnames; sheetname=sheetname) + return XLSX.writetable(f.filename, cols, colnames; kw...) 
end IteratorInterfaceExtensions.isiterable(x::ExcelFile) = true TableTraits.isiterabletable(x::ExcelFile) = true -function gennames(n::Integer) - res = Vector{Symbol}(undef, n) - for i in 1:n - res[i] = Symbol(@sprintf "x%d" i) - end - return res -end - -function _readxl(file::ExcelReaders.ExcelFile, sheetname::AbstractString, startrow::Integer, startcol::Integer, endrow::Integer, endcol::Integer; header::Bool=true, colnames::Vector{Symbol}=Symbol[]) - data = ExcelReaders.readxl_internal(file, sheetname, startrow, startcol, endrow, endcol) - - nrow, ncol = size(data) - - if length(colnames) == 0 - if header - headervec = data[1, :] - NAcol = map(i -> isa(i, DataValues.DataValue) && DataValues.isna(i), headervec) - headervec[NAcol] = gennames(count(!iszero, NAcol)) - - # This somewhat complicated conditional makes sure that column names - # that are integer numbers end up without an extra ".0" as their name - colnames = [isa(i, AbstractFloat) ? ( modf(i)[1] == 0.0 ? Symbol(Int(i)) : Symbol(string(i)) ) : Symbol(i) for i in vec(headervec)] +function _readxl(file::ExcelFile) + if isnothing(file.columns) + if isnothing(file.sheet) + table=XLSX.readtable(file.filename, "Sheet1"; file.keywords...) else - colnames = gennames(ncol) + table=XLSX.readtable(file.filename, file.sheet; file.keywords...) end - elseif length(colnames) != ncol - error("Length of colnames must equal number of columns in selected range") + else + table=XLSX.readtable(file.filename, file.sheet, file.columns; file.keywords...) end - - columns = Array{Any}(undef, ncol) - - for i = 1:ncol - if header - vals = data[2:end,i] - else - vals = data[:,i] - end - - # Check whether all non-NA values in this column - # are of the same type - type_of_el = length(vals) > 0 ? 
typeof(vals[1]) : Any - for val = vals - type_of_el = promote_type(type_of_el, typeof(val)) - end - - if type_of_el <: DataValue - columns[i] = convert(DataValueArray{eltype(type_of_el)}, vals) - - # TODO Check wether this hack is correct - for (j, v) in enumerate(columns[i]) - if v isa DataValue && !DataValues.isna(v) && v[] isa DataValue - columns[i][j] = v[] - end - end - else - columns[i] = convert(Array{type_of_el}, vals) - end + colnames=Vector{Symbol}(undef, length(table.data)) + for (k, v) in table.column_label_index + colnames[v] = Symbol(k) end - - return columns, colnames + return table.data, colnames end function IteratorInterfaceExtensions.getiterator(file::ExcelFile) - column_data, col_names = if occursin("!", file.range) - excelfile = openxl(file.filename) - - sheetname, startrow, startcol, endrow, endcol = ExcelReaders.convert_ref_to_sheet_row_col(file.range) - - _readxl(excelfile, sheetname, startrow, startcol, endrow, endcol; file.keywords...) - else - excelfile = openxl(file.filename) - sheet = excelfile.workbook.sheet_by_name(file.range) - - keywords = filter(i -> !(i[1] in (:header, :colnames)), file.keywords) - startrow, startcol, endrow, endcol = ExcelReaders.convert_args_to_row_col(sheet; keywords...) - - keywords2 = copy(file.keywords) - keywords2 = filter(i -> !(i[1] in (:skipstartrows, :skipstartcols, :nrows, :ncols)), file.keywords) - - _readxl(excelfile, file.range, startrow, startcol, endrow, endcol; keywords2...) 
- end - + column_data, col_names = _readxl(file) return create_tableiterator(column_data, col_names) end diff --git a/test/runtests.jl b/test/runtests.jl index d1d0372..0415b98 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,24 +1,36 @@ using ExcelFiles -using ExcelReaders using IteratorInterfaceExtensions using TableTraits using TableTraitsUtils using Dates +using XLSX using DataValues using DataFrames using Test +data_directory = joinpath(dirname(pathof(ExcelFiles)), "..", "data") +@assert isdir(data_directory) + @testset "ExcelFiles" begin - filename = normpath(dirname(pathof(ExcelReaders)), "..", "test", "TestData.xlsx") + filename = joinpath(data_directory, "TestData.xlsx") efile = load(filename, "Sheet1") - @test sprint((stream, data) -> show(stream, "text/html", data), efile) == "
Some Float64sSome StringsSome BoolsMixed columnMixed with NAFloat64 with NAString with NABool with NASome datesDates with NASome errorsErrors with NAColumn with NULL and then mixed
1.0"A"true2.09.03.0"FF"#NA2015-03-03T00:00:001965-04-03T00:00:00#DIV/0!#DIV/0!#NA
1.5"BB"false"EEEEE""III"#NA#NAtrue2015-02-04T10:14:001950-08-09T18:40:00#N/A#N/A3.4
2.0"CCC"falsefalse#NA3.5"GGG"#NA1988-04-09T00:00:0019:00:00#REF!#NAME?"HKEJW"
2.5"DDDD"true1.5true4.0"HHHH"false15:02:00#NA#NAME?#NA#NA
" + # XLSX.jl v0.10.4 + @test sprint((stream, data) -> show(stream, "text/html", data), efile) == "
Some Float64sSome StringsSome BoolsMixed columnMixed with NAFloat64 with NAString with NABool with NASome datesDates with NASome errorsErrors with NAColumn with NULL and then mixed
1"A"true293"FF"#NADate("2015-03-03")Date("1965-04-03")#NA#NA#NA
1.5"BB"false"EEEEE""III"#NA#NAtrue2015-02-04T10:14:001950-08-09T18:40:00#NA#NA3.4
2"CCC"falsefalse#NA3.5"GGG"#NADate("1988-04-09")19:00:00#NA#NA"HKEJW"
2.5"DDDD"true1.5true4"HHHH"false15:02:00#NA#NA#NA#NA
" + + # XLSX.jl v0.11.0 (default behaviour in `readtable` switches to `infer_eltypes=true` so the type eg. of Bools is inferred correctly) +# @test sprint((stream, data) -> show(stream, "text/html", data), efile) == "
Some Float64sSome StringsSome BoolsMixed columnMixed with NAFloat64 with NAString with NABool with NASome datesDates with NASome errorsErrors with NAColumn with NULL and then mixed
1.0"A"true293.0"FF"#NA2015-03-03Date("1965-04-03")#NA#NA#NA
1.5"BB"false"EEEEE""III"#NA#NAtrue2015-02-04T10:14:001950-08-09T18:40:00#NA#NA3.4
2.0"CCC"falsefalse#NA3.5"GGG"#NA1988-04-0919:00:00#NA#NA"HKEJW"
2.5"DDDD"true1.5true4.0"HHHH"false15:02:00#NA#NA#NA#NA
" + + # XLSX.jl v0.10.4 + @test sprint((stream, data) -> show(stream, "application/vnd.dataresource+json", data), efile) == "{\"schema\":{\"fields\":[{\"name\":\"Some Float64s\",\"type\":\"string\"},{\"name\":\"Some Strings\",\"type\":\"string\"},{\"name\":\"Some Bools\",\"type\":\"string\"},{\"name\":\"Mixed column\",\"type\":\"string\"},{\"name\":\"Mixed with NA\",\"type\":\"string\"},{\"name\":\"Float64 with NA\",\"type\":\"string\"},{\"name\":\"String with NA\",\"type\":\"string\"},{\"name\":\"Bool with NA\",\"type\":\"string\"},{\"name\":\"Some dates\",\"type\":\"string\"},{\"name\":\"Dates with NA\",\"type\":\"string\"},{\"name\":\"Some errors\",\"type\":\"string\"},{\"name\":\"Errors with NA\",\"type\":\"string\"},{\"name\":\"Column with NULL and then mixed\",\"type\":\"string\"}]},\"data\":[{\"Some Float64s\":1,\"Some Strings\":\"A\",\"Some Bools\":true,\"Mixed column\":2,\"Mixed with NA\":9,\"Float64 with NA\":3,\"String with NA\":\"FF\",\"Bool with NA\":null,\"Some dates\":\"2015-03-03\",\"Dates with NA\":\"1965-04-03\",\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":null},{\"Some Float64s\":1.5,\"Some Strings\":\"BB\",\"Some Bools\":false,\"Mixed column\":\"EEEEE\",\"Mixed with NA\":\"III\",\"Float64 with NA\":null,\"String with NA\":null,\"Bool with NA\":true,\"Some dates\":\"2015-02-04T10:14:00\",\"Dates with NA\":\"1950-08-09T18:40:00\",\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":3.4},{\"Some Float64s\":2,\"Some Strings\":\"CCC\",\"Some Bools\":false,\"Mixed column\":false,\"Mixed with NA\":null,\"Float64 with NA\":3.5,\"String with NA\":\"GGG\",\"Bool with NA\":null,\"Some dates\":\"1988-04-09\",\"Dates with NA\":\"19:00:00\",\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":\"HKEJW\"},{\"Some Float64s\":2.5,\"Some Strings\":\"DDDD\",\"Some Bools\":true,\"Mixed column\":1.5,\"Mixed with NA\":true,\"Float64 with NA\":4,\"String with 
NA\":\"HHHH\",\"Bool with NA\":false,\"Some dates\":\"15:02:00\",\"Dates with NA\":null,\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":null}]}" - @test sprint((stream, data) -> show(stream, "application/vnd.dataresource+json", data), efile) == "{\"schema\":{\"fields\":[{\"name\":\"Some Float64s\",\"type\":\"number\"},{\"name\":\"Some Strings\",\"type\":\"string\"},{\"name\":\"Some Bools\",\"type\":\"boolean\"},{\"name\":\"Mixed column\",\"type\":\"string\"},{\"name\":\"Mixed with NA\",\"type\":\"string\"},{\"name\":\"Float64 with NA\",\"type\":\"number\"},{\"name\":\"String with NA\",\"type\":\"string\"},{\"name\":\"Bool with NA\",\"type\":\"boolean\"},{\"name\":\"Some dates\",\"type\":\"string\"},{\"name\":\"Dates with NA\",\"type\":\"string\"},{\"name\":\"Some errors\",\"type\":\"string\"},{\"name\":\"Errors with NA\",\"type\":\"string\"},{\"name\":\"Column with NULL and then mixed\",\"type\":\"string\"}]},\"data\":[{\"Some Float64s\":1.0,\"Some Strings\":\"A\",\"Some Bools\":true,\"Mixed column\":2.0,\"Mixed with NA\":9.0,\"Float64 with NA\":3.0,\"String with NA\":\"FF\",\"Bool with NA\":null,\"Some dates\":\"2015-03-03T00:00:00\",\"Dates with NA\":\"1965-04-03T00:00:00\",\"Some errors\":{\"errorcode\":7},\"Errors with NA\":{\"errorcode\":7},\"Column with NULL and then mixed\":null},{\"Some Float64s\":1.5,\"Some Strings\":\"BB\",\"Some Bools\":false,\"Mixed column\":\"EEEEE\",\"Mixed with NA\":\"III\",\"Float64 with NA\":null,\"String with NA\":null,\"Bool with NA\":true,\"Some dates\":\"2015-02-04T10:14:00\",\"Dates with NA\":\"1950-08-09T18:40:00\",\"Some errors\":{\"errorcode\":42},\"Errors with NA\":{\"errorcode\":42},\"Column with NULL and then mixed\":3.4},{\"Some Float64s\":2.0,\"Some Strings\":\"CCC\",\"Some Bools\":false,\"Mixed column\":false,\"Mixed with NA\":null,\"Float64 with NA\":3.5,\"String with NA\":\"GGG\",\"Bool with NA\":null,\"Some dates\":\"1988-04-09T00:00:00\",\"Dates with NA\":\"19:00:00\",\"Some 
errors\":{\"errorcode\":23},\"Errors with NA\":{\"errorcode\":29},\"Column with NULL and then mixed\":\"HKEJW\"},{\"Some Float64s\":2.5,\"Some Strings\":\"DDDD\",\"Some Bools\":true,\"Mixed column\":1.5,\"Mixed with NA\":true,\"Float64 with NA\":4.0,\"String with NA\":\"HHHH\",\"Bool with NA\":false,\"Some dates\":\"15:02:00\",\"Dates with NA\":null,\"Some errors\":{\"errorcode\":29},\"Errors with NA\":null,\"Column with NULL and then mixed\":null}]}" + # XLSX.jl v0.11.0 (default behaviour in `readtable` switches to `infer_eltypes=true` so the type eg. of Bools is inferred correctly) +# @test sprint((stream, data) -> show(stream, "application/vnd.dataresource+json", data), efile) == "{\"schema\":{\"fields\":[{\"name\":\"Some Float64s\",\"type\":\"number\"},{\"name\":\"Some Strings\",\"type\":\"string\"},{\"name\":\"Some Bools\",\"type\":\"boolean\"},{\"name\":\"Mixed column\",\"type\":\"string\"},{\"name\":\"Mixed with NA\",\"type\":\"string\"},{\"name\":\"Float64 with NA\",\"type\":\"number\"},{\"name\":\"String with NA\",\"type\":\"string\"},{\"name\":\"Bool with NA\",\"type\":\"boolean\"},{\"name\":\"Some dates\",\"type\":\"string\"},{\"name\":\"Dates with NA\",\"type\":\"string\"},{\"name\":\"Some errors\",\"type\":\"string\"},{\"name\":\"Errors with NA\",\"type\":\"string\"},{\"name\":\"Column with NULL and then mixed\",\"type\":\"string\"}]},\"data\":[{\"Some Float64s\":1.0,\"Some Strings\":\"A\",\"Some Bools\":true,\"Mixed column\":2,\"Mixed with NA\":9,\"Float64 with NA\":3.0,\"String with NA\":\"FF\",\"Bool with NA\":null,\"Some dates\":\"2015-03-03\",\"Dates with NA\":\"1965-04-03\",\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":null},{\"Some Float64s\":1.5,\"Some Strings\":\"BB\",\"Some Bools\":false,\"Mixed column\":\"EEEEE\",\"Mixed with NA\":\"III\",\"Float64 with NA\":null,\"String with NA\":null,\"Bool with NA\":true,\"Some dates\":\"2015-02-04T10:14:00\",\"Dates with NA\":\"1950-08-09T18:40:00\",\"Some 
errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":3.4},{\"Some Float64s\":2.0,\"Some Strings\":\"CCC\",\"Some Bools\":false,\"Mixed column\":false,\"Mixed with NA\":null,\"Float64 with NA\":3.5,\"String with NA\":\"GGG\",\"Bool with NA\":null,\"Some dates\":\"1988-04-09\",\"Dates with NA\":\"19:00:00\",\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":\"HKEJW\"},{\"Some Float64s\":2.5,\"Some Strings\":\"DDDD\",\"Some Bools\":true,\"Mixed column\":1.5,\"Mixed with NA\":true,\"Float64 with NA\":4.0,\"String with NA\":\"HHHH\",\"Bool with NA\":false,\"Some dates\":\"15:02:00\",\"Dates with NA\":null,\"Some errors\":null,\"Errors with NA\":null,\"Column with NULL and then mixed\":null}]}" - @test sprint(show, efile) == "4x13 Excel file\nSome Float64s │ Some Strings │ Some Bools │ Mixed column │ Mixed with NA\n──────────────┼──────────────┼────────────┼──────────────┼──────────────\n1.0 │ A │ true │ 2.0 │ 9.0 \n1.5 │ BB │ false │ \"EEEEE\" │ \"III\" \n2.0 │ CCC │ false │ false │ #NA \n2.5 │ DDDD │ true │ 1.5 │ true \n... with 8 more columns: Float64 with NA, String with NA, Bool with NA, Some dates, Dates with NA, Some errors, Errors with NA, Column with NULL and then mixed" +# This test is truncated (... with 8 more columns:) so probably isn't robust - although it passes locally. +# @test sprint(show, efile) == "4x13 Excel file\nSome Float64s │ Some Strings │ Some Bools │ Mixed column │ Mixed with NA\n──────────────┼──────────────┼────────────┼──────────────┼──────────────\n1.0 │ A │ true │ 2 │ 9 \n1.5 │ BB │ false │ \"EEEEE\" │ \"III\" \n2.0 │ CCC │ false │ false │ #NA \n2.5 │ DDDD │ true │ 1.5 │ true \n... 
with 8 more columns: Float64 with NA, String with NA, Bool with NA, Some dates, Dates with NA, Some errors, Errors with NA, Column with NULL and then mixed" @test TableTraits.isiterabletable(efile) == true @test IteratorInterfaceExtensions.isiterable(efile) == true @@ -27,7 +39,7 @@ using Test @test isiterable(efile) == true - full_dfs = [create_columns_from_iterabletable(load(filename, "Sheet1!C3:O7")), create_columns_from_iterabletable(load(filename, "Sheet1"))] + full_dfs = [create_columns_from_iterabletable(load(filename, "Sheet1", "C:O"; first_row=3)), create_columns_from_iterabletable(load(filename, "Sheet1"))] for (df, names) in full_dfs @test length(df) == 13 @test length(df[1]) == 4 @@ -42,16 +54,13 @@ using Test @test df[8] == [NA, true, NA, false] @test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), Date(1988, 4, 9), Dates.Time(15, 2, 0)] @test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19, 0, 0), NA] - @test eltype(df[11]) == ExcelReaders.ExcelErrorCell - @test df[12][1][] isa ExcelReaders.ExcelErrorCell - @test df[12][2][] isa ExcelReaders.ExcelErrorCell - @test df[12][3][] isa ExcelReaders.ExcelErrorCell - @test df[12][4] == NA + @test df[11] == [DataValue(), DataValue(), DataValue(), DataValue()] + @test df[12] == [DataValue(), DataValue(), DataValue(), NA] @test df[13] == [NA, 3.4, "HKEJW", NA] end - df, names = create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=false)) - @test names == [:x1,:x2,:x3,:x4,:x5,:x6,:x7,:x8,:x9,:x10,:x11,:x12,:x13] + df, names = create_columns_from_iterabletable(load(filename, "Sheet1", "C:O"; first_row=4, header=false)) + @test names == [:C, :D, :E, :F, :G, :H, :I, :J, :K, :L, :M, :N, :O] @test length(df[1]) == 4 @test length(df) == 13 @test df[1] == [1., 1.5, 2., 2.5] @@ -64,19 +73,14 @@ using Test @test df[8] == [NA, true, NA, false] @test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15, 2, 0)] @test df[10] == 
[Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19, 0, 0), NA] - @test isa(df[11][1], ExcelReaders.ExcelErrorCell) - @test isa(df[11][2], ExcelReaders.ExcelErrorCell) - @test isa(df[11][3], ExcelReaders.ExcelErrorCell) - @test isa(df[11][4], ExcelReaders.ExcelErrorCell) - @test isa(df[12][1][], ExcelReaders.ExcelErrorCell) - @test isa(df[12][2][], ExcelReaders.ExcelErrorCell) - @test isa(df[12][3][], ExcelReaders.ExcelErrorCell) - @test DataValues.isna(df[12][4]) + @test df[11] == [DataValue(), DataValue(), DataValue(), DataValue()] + @test df[12] == [DataValue(), DataValue(), DataValue(), NA] @test df[13] == [NA, 3.4, "HKEJW", NA] + @test DataValues.isna(df[12][4]) good_colnames = [:c1, :c2, :c3, :c4, :c5, :c6, :c7, :c8, :c9, :c10, :c11, :c12, :c13] - df, names = create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=false, colnames=good_colnames)) + df, names = create_columns_from_iterabletable(load(filename, "Sheet1", "C:O"; first_row=4, header=false, column_labels=good_colnames)) @test names == good_colnames @test length(df[1]) == 4 @test length(df) == 13 @@ -90,15 +94,10 @@ using Test @test df[8] == [NA, true, NA, false] @test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15, 2, 0)] @test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19, 0, 0), NA] - @test isa(df[11][1], ExcelReaders.ExcelErrorCell) - @test isa(df[11][2], ExcelReaders.ExcelErrorCell) - @test isa(df[11][3], ExcelReaders.ExcelErrorCell) - @test isa(df[11][4], ExcelReaders.ExcelErrorCell) - @test isa(df[12][1][], ExcelReaders.ExcelErrorCell) - @test isa(df[12][2][], ExcelReaders.ExcelErrorCell) - @test isa(df[12][3][], ExcelReaders.ExcelErrorCell) - @test DataValues.isna(df[12][4]) + @test df[11] == [DataValue(), DataValue(), DataValue(), DataValue()] + @test df[12] == [DataValue(), DataValue(), DataValue(), NA] @test df[13] == [NA, 3.4, "HKEJW", NA] + @test DataValues.isna(df[12][4]) # Test for 
saving DataFrame to XLSX input = (Day = ["Nov. 27","Nov. 28","Nov. 29"], Highest = [78,79,75]) |> DataFrame @@ -114,7 +113,7 @@ using Test @test input == output rm("file.xlsx") - df, names = create_columns_from_iterabletable(load(filename, "Sheet1", colnames=good_colnames)) + df, names = create_columns_from_iterabletable(load(filename, "Sheet1"; column_labels=good_colnames)) @test names == good_colnames @test length(df[1]) == 4 @test length(df) == 13 @@ -128,22 +127,25 @@ using Test @test df[8] == [NA, true, NA, false] @test df[9] == [Date(2015, 3, 3), DateTime(2015, 2, 4, 10, 14), DateTime(1988, 4, 9), Dates.Time(15, 2, 0)] @test df[10] == [Date(1965, 4, 3), DateTime(1950, 8, 9, 18, 40), Dates.Time(19, 0, 0), NA] - @test isa(df[11][1], ExcelReaders.ExcelErrorCell) - @test isa(df[11][2], ExcelReaders.ExcelErrorCell) - @test isa(df[11][3], ExcelReaders.ExcelErrorCell) - @test isa(df[11][4], ExcelReaders.ExcelErrorCell) - @test isa(df[12][1][], ExcelReaders.ExcelErrorCell) - @test isa(df[12][2][], ExcelReaders.ExcelErrorCell) - @test isa(df[12][3][], ExcelReaders.ExcelErrorCell) - @test DataValues.isna(df[12][4]) + @test df[11] == [DataValue(), DataValue(), DataValue(), DataValue()] + @test df[12] == [DataValue(), DataValue(), DataValue(), NA] @test df[13] == [NA, 3.4, "HKEJW", NA] + @test DataValues.isna(df[12][4]) -# Too few colnames - @test_throws ErrorException create_columns_from_iterabletable(load(filename, "Sheet1!C4:O7", header=true, colnames=[:c1, :c2, :c3, :c4])) +# Too few column labels + # XLSX.jl v0.10.4 + @test_throws AssertionError create_columns_from_iterabletable(load(filename, "Sheet1", "C:O"; header=true, column_labels=[:c1, :c2, :c3, :c4])) + + # XLSX.jl v0.11.0 +# @test_throws XLSX.XLSXError create_columns_from_iterabletable(load(filename, "Sheet1", "C:O"; header=true, column_labels=[:c1, :c2, :c3, :c4])) # Test for constructing DataFrame with empty header cell - data, names = create_columns_from_iterabletable(load(filename, "Sheet2!C5:E7")) - 
@test names == [:Col1, :x1, :Col3] + data, names = create_columns_from_iterabletable(load(filename, "Sheet2", "C:E")) + @test names == [:Col1, Symbol("#Empty"), :Col3] + + # XLSX.jl v0.11.0. The `normalizenames` keyword not available in 0.10.4 +# data, names = create_columns_from_iterabletable(load(filename, "Sheet2", "C:E"; normalizenames=true)) +# @test names == [:Col1, :_Empty, :Col3] end