Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
StructTypes = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
SumTypes = "8e1ec7a9-0e02-4297-b0fe-6433085c89f2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Expand All @@ -22,7 +21,6 @@ AutoHashEquals = "2.2"
Compat = "3.42, 4"
DataStructures = "0.18.14"
PooledArrays = "1.4"
ScopedValues = "1.3"
StructTypes = "1.10"
SumTypes = "0.4, 0.5"
Tables = "1"
Expand Down
18 changes: 8 additions & 10 deletions src/Core.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
using ScopedValues: with

import .NestedIterators: RawNestedIterator
import .ColumnSetManagers: ColumnSet, cycle_columns_to_length!, repeat_each_column!, get_first_key,
get_total_length, column_length, set_length!, free_column_set!, build_final_column_set
Expand Down Expand Up @@ -43,14 +41,14 @@ function expand(data, column_definitions=nothing;
typed_column_style = get_column_style(column_style)

if use_v2 && isnothing(column_definitions)
return with(ExpandNestedData2.DEFAULT_MISSING => default_value) do
ExpandNestedData2.expand(data;
pool_arrays=pool_arrays,
lazy_columns=lazy_columns,
name_join_pattern=name_join_pattern,
column_style=column_style
)
end
return ExpandNestedData2.expand(data;
default_value=default_value,
pool_arrays=pool_arrays,
lazy_columns=lazy_columns,
name_join_pattern=name_join_pattern,
column_style=column_style,
column_names=column_names
)
end

csm = ColumnSetManager()
Expand Down
134 changes: 90 additions & 44 deletions src/ExpandNestedData2.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
module ExpandNestedData2
using Accessors: @set
using ScopedValues: ScopedValue, with
using AutoHashEquals: @auto_hash_equals
using SumTypes: @sum_type, @cases
using StructTypes: StructTypes
using PooledArrays: PooledArray
using TypedTables: FlexTable

using Base.Iterators: repeated, flatmap
# Short local aliases for the lazy adaptors in `Base.Iterators`, so call sites can
# write `ifilter` / `imap` / `izip` instead of the fully qualified names.
const ifilter = Iterators.filter
const imap = Iterators.map
const izip = Iterators.zip

# StructTypes kinds whose contents are addressed by name (dictionary keys or
# object properties).
# Declared `const` so that functions reading these module-level bindings can be
# compiled against a fixed type instead of an untyped global.
const NameValueContainer = Union{StructTypes.DictType, StructTypes.DataType}
# Every StructTypes kind treated as a container: the name/value kinds plus arrays.
const Container = Union{NameValueContainer, StructTypes.ArrayType}
Expand All @@ -16,7 +18,31 @@ is_container(t) = typeof(StructTypes.StructType(t)) <: Container
# Column layout options: `flat_columns` or `nested_columns`.
# NOTE(review): `expand` in this module compares `column_style` against the Symbol
# `:flat`; confirm where this enum is actually consumed.
@enum ColumnStyle flat_columns nested_columns
# Array pooling options: `NEVER`, `ALWAYS`, or `AUTO`.
# NOTE(review): presumably related to the `pool_arrays` keyword — confirm against callers.
@enum PoolArrayOptions NEVER ALWAYS AUTO

const DEFAULT_MISSING = ScopedValue(missing)
# Private sentinel type. Its single instance marks "key not present" so that stored
# values such as `missing` or `nothing` are never mistaken for an absent key.
struct CustomMissingValue end
const MISSING = CustomMissingValue()

# `@get(dict, key, default)` looks `key` up in `dict` and yields the stored value
# when the key is present. Otherwise it yields `default`.
#
# Unlike a plain `get(dict, key, default)` call, the `default` expression is only
# evaluated when the key is absent.
macro get(dict, key, default)
    # The lookup uses the sentinel as its fallback so absence can be detected with `===`.
    lookup = :(get($(esc(dict)), $(esc(key)), $MISSING))
    fallback = esc(default)
    return quote
        let found = $lookup
            found === $MISSING ? $fallback : found
        end
    end
end

# `@getproperty(obj, key, default)` yields `getproperty(obj, key)` when `obj` has a
# property named `key`. Otherwise it yields `default`.
#
# All three arguments are escaped so they resolve in the caller's scope; without
# `esc`, macro hygiene would treat caller locals as globals of this module.
# `obj` and `key` are each evaluated exactly once, and `default` is evaluated only
# when the property is absent.
macro getproperty(obj, key, default)
    quote
        # Bind once so an `obj` expression with side effects is not run twice.
        let target = $(esc(obj)), name = $(esc(key))
            if hasproperty(target, name)
                getproperty(target, name)
            else
                $(esc(default))
            end
        end
    end
end


include("PathGraph2.jl")

Expand All @@ -41,6 +67,7 @@ NamePart(;name=nothing) = NamePart(name)
parts::Vector{NamePart}
end
# Build an empty path (a `NamePath` with no parts).
NamePath() = NamePath(NamePart[])
# Build a path from positional arguments, wrapping each one in a `NamePart`.
NamePath(parts...) = NamePath([NamePart(part) for part in parts])

function append(np::NamePath, name)
new_parts = copy(np.parts)
Expand Down Expand Up @@ -116,12 +143,11 @@ function get_all_seeds(ic::IterCapture, up_to::Int=64)
end



# An `IterCapture` reports a one-dimensional size: a 1-tuple holding its length.
Base.size(ic::IterCapture) = (length(ic),)
# Wrap a single value in the `Seed` variant of `IterCapture`, parameterized by the
# value's own type.
seed(data::T) where T = IterCapture'.Seed{T}(data)
function seed_vector(@nospecialize(data))
function seed_vector(@nospecialize(data), default_value)
if length(data) == 0
return seed(DEFAULT_MISSING[])
return seed(default_value)
elseif length(data) == 1
return seed(only(data))
elseif data isa Vector
Expand All @@ -138,7 +164,7 @@ function concat(ics)
final_indices = accumulate(+, length.(ics))
children = Pair{Int,IterCapture}[
i => ic
for (i, ic) in Iterators.zip(final_indices, ics)
for (i, ic) in izip(final_indices, ics)
]
len = last(final_indices)
res::IterCapture{T} = IterCapture'.Concat(len, n, children)
Expand Down Expand Up @@ -186,63 +212,84 @@ function get_name_path(c::Column)
end

function expand(data;
default_value=missing,
pool_arrays=false,
lazy_columns=false,
name_join_pattern="_",
column_style=:flat
column_style=:flat,
lazy_columns = false,
column_names = ()
)
col_set = _expand(data, NamePath())
col_set = _expand(data, NamePath(), default_value)

if column_style == :flat
# TODO make this a FlexTable before 2.0
return (;
(
join_name_path(c.name, name_join_pattern) => lazy_columns ? c.data : collect(c; pool_arrays=pool_arrays)
for c in col_set
)...
column_name_lookup = Dict(
NamePath(parts...) => replacement
for (parts, replacement) in column_names
)

final_pairs = (
get_flattened_name_column_pair(c, column_name_lookup;
pool_arrays,
name_join_pattern,
lazy_columns
)
for c in col_set
)

# TODO make this a FlexTable before 2.0
return (; final_pairs...)
end

name_paths = get_name_path.(col_set)
path_graph = make_path_graph(name_paths)
make_nested_table(col_set, path_graph)
end

# Produce the `name => data` pair for one column of a flat result.
#
# The name is the entry stored in `column_name_lookup` under `column.name` when one
# exists. Otherwise it is `join_name_path(column.name, name_join_pattern)`.
# The data is `column.data` untouched when `lazy_columns` is true. Otherwise it is
# `collect(column; pool_arrays=pool_arrays)`.
function get_flattened_name_column_pair(column, column_name_lookup; pool_arrays, name_join_pattern, lazy_columns)
    label = @get(column_name_lookup, column.name, join_name_path(column.name, name_join_pattern))
    contents = lazy_columns ? column.data : collect(column; pool_arrays=pool_arrays)
    return label => contents
end

# Render a `NamePath` as a single `Symbol`.
#
# Each part's `name` is converted with `string`, the pieces are joined with
# `join_pattern`, and the joined text is wrapped in a `Symbol`.
function join_name_path(np::NamePath, join_pattern)
    # Lazy map: `join` consumes the names directly, with no intermediate vector.
    parts = imap(p -> string(p.name), np.parts)
    return Symbol(join(parts, join_pattern))
end

function _expand(@nospecialize(data), name_path)
function _expand(@nospecialize(data), name_path, default_value)
T = typeof(data)
StructT = typeof(StructTypes.StructType(T))
if StructT <: StructTypes.DictType
return _expand_dict(data, name_path)
return _expand_dict(data, name_path, default_value)
elseif StructT <: StructTypes.DataType
return _expand_data_type(data, name_path)
return _expand_data_type(data, name_path, default_value)
elseif StructT <: StructTypes.ArrayType
return _expand_array(data, name_path)
return _expand_array(data, name_path, default_value)
else
return _expand_leaf(data, name_path)
end
end

function _expand_dict(@nospecialize(data), name_path::NamePath)
return _expand_name_value_container(data, keys(data), getindex, name_path)
function _expand_dict(@nospecialize(data), name_path::NamePath, default_value)
return _expand_name_value_container(data, keys(data), getindex, name_path, default_value)
end

function _expand_data_type(@nospecialize(data), name_path::NamePath)
return _expand_name_value_container(data, propertynames(data), getproperty, name_path)
function _expand_data_type(@nospecialize(data), name_path::NamePath, default_value)
return _expand_name_value_container(data, propertynames(data), getproperty, name_path, default_value)
end

function _expand_name_value_container(@nospecialize(data), @nospecialize(names), getter, name_path::NamePath)
function _expand_name_value_container(@nospecialize(data), @nospecialize(names), getter, name_path::NamePath, default_value)
if length(names) == 0
return Column[]
end

list_of_column_sets = Vector{Column}[
_expand(getter(data, name), append(name_path, name))
_expand(getter(data, name), append(name_path, name), default_value)
for name in names
]
return merge_columns!(list_of_column_sets)
Expand Down Expand Up @@ -284,49 +331,48 @@ function Base.vcat(columns::Column...)
)
end

function _expand_array(@nospecialize(data), name_path)
function _expand_array(@nospecialize(data), name_path, default_value)
element_count = length(data)

if element_count == 0
return Column[Column(name_path, seed(DEFAULT_MISSING[]))]
return Column[Column(name_path, seed(default_value))]
elseif element_count == 1
return _expand(only(data), name_path)
return _expand(only(data), name_path, default_value)
end

container_count = sum(is_container, data)

# No containers at all
if container_count == 0
return Column[Column(name_path, seed_vector(data))]
return Column[Column(name_path, seed_vector(data, default_value))]
end

containers = Iterators.filter(is_container, data)
expanded = Iterators.map(_expand, containers, Iterators.repeated(name_path))
no_empties = collect(Iterators.filter(!isempty, expanded))
all_names = Set(Iterators.flatmap(c -> (x.name for x in c), no_empties))
stacked_columns = Column[stack_columns(no_empties, name) for name in all_names]
containers = ifilter(is_container, data)
expanded = imap(_expand, containers, repeated(name_path), repeated(default_value))
no_empties = collect(ifilter(!isempty, expanded))
all_names = Set(flatmap(c -> (x.name for x in c), no_empties))
stacked_columns = Column[stack_columns(no_empties, name, default_value) for name in all_names]

if container_count != element_count
loose_values = filter(!is_container, data)
loose_columns = Column[Column(name_path, seed_vector(loose_values))]
loose_columns = Column[Column(name_path, seed_vector(loose_values, default_value))]
return merge_columns!((loose_columns, stacked_columns))
end
return stacked_columns
end

function stack_columns(column_sets, name)
data = concat(map(c -> get_column(c, name).data, column_sets))
function stack_columns(column_sets, name, default_value)
data = concat(map(c -> get_column(c, name, default_value).data, column_sets))
Column(name, data)
end



function get_column(column_set, name_path)
function get_column(column_set, name_path, default_value)
len = length(column_set[1])
i = findfirst(c -> c.name == name_path, column_set)
if isnothing(i)
x = DEFAULT_MISSING[]
return cycle_column(Column(name_path, seed(x)), len)
return cycle_column(Column(name_path, seed(default_value)), len)
end
return column_set[i]
end
Expand All @@ -339,8 +385,8 @@ end
function make_nested_table(column_set, path_graph::PathNode, name_path::NamePath=NamePath())
return @cases path_graph begin
[TopLevelNode, BranchNode] => table_from_children(column_set, path_graph, name_path)
LeafNode(name, _, pool_arrays, _) => collect(
get_column(column_set, name_path); pool_arrays=pool_arrays)
LeafNode(name, default_value, pool_arrays, _) => collect(
get_column(column_set, name_path, default_value); pool_arrays=pool_arrays)
end
end

Expand Down
5 changes: 1 addition & 4 deletions src/PathGraph2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ function Node(name, children)
end
function LeafNode(name_path)
name = last(name_path)
default_value = get_default_value(name_path)
default_value = nothing#get_default_value(name_path)
pool_arrays = get_pool_arrays(name_path)
dtype = get_dtype(name_path)
return PathNode'.LeafNode(name, default_value, pool_arrays, dtype)
Expand All @@ -47,9 +47,6 @@ function get_default_value(node::PathNode)
[BranchNode, TopLevelNode] => error("Can't access default value for non-leaf node")
end
end
function get_default_value(::Any)
return DEFAULT_MISSING[]
end
function get_pool_arrays(node::PathNode)
return @cases node begin
LeafNode(_, _, _, pool_arrays) => pool_arrays
Expand Down
Loading
Loading