Skip to content

Commit 9c21e5f

Browse files
authored
Merge pull request #14 from AlphaQuantJS/dev
fix: dataframe transform methods and tests
2 parents 2802ad8 + b4cc116 commit 9c21e5f

3 files changed

Lines changed: 600 additions & 629 deletions

File tree

src/methods/dataframe/transform/join.js

Lines changed: 162 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -5,210 +5,226 @@
55
*/
66
/**
 * Normalizes a join-column option (single name or array of names) to a new
 * array of column names.
 */
const toColumnList = (spec) => (Array.isArray(spec) ? [...spec] : [spec]);

/**
 * Builds the lookup key of a row for the given key columns.
 *
 * Each value is stringified the way `Array.prototype.join` would do it
 * (null/undefined become '', everything else goes through `String`), so the
 * number 1 and the string '1' still match. The parts are then encoded with
 * `JSON.stringify` instead of being glued with a separator, so values that
 * contain the separator can no longer make two different composite keys
 * collide.
 */
const rowKey = (row, cols) =>
  JSON.stringify(
    cols.map((col) => (row[col] == null ? '' : String(row[col]))),
  );

/**
 * Picks the placeholder for a missing cell of `col`: NaN when the first
 * non-null value found in `rows` is a number, null otherwise (including when
 * there are no rows or no non-null values).
 */
const missingValueFor = (rows, col) => {
  const sample = rows.find((row) => row[col] != null);
  return sample !== undefined && typeof sample[col] === 'number' ? NaN : null;
};

/**
 * Assembles one output row from a left row and/or a right row.
 * Pass `null` for the side that has no matching row; its cells are filled
 * with the per-column placeholder from the plan.
 */
const assembleRow = (leftPlan, rightPlan, leftRow, rightRow) => {
  const out = {};

  for (const { col, name, fill } of leftPlan) {
    out[name] = leftRow ? leftRow[col] : fill;
  }

  for (const { col, name, shared, fill } of rightPlan) {
    if (shared) {
      // A key column common to both sides appears once; the right value is
      // only needed when there is no left row to provide it.
      if (!leftRow) {
        out[name] = rightRow[col];
      }
      continue;
    }
    out[name] = rightRow ? rightRow[col] : fill;
  }

  return out;
};

/**
 * Creates a function that joins two DataFrames on one or more key columns.
 *
 * The returned function takes `(df, other, options)` where `options` is:
 * - `on`: column name or array of names present in both frames;
 * - `leftOn` / `rightOn`: key column(s) of the left / right frame when the
 *   names differ (the legacy spellings `left_on` / `right_on` are accepted
 *   as aliases);
 * - `how`: 'inner' (default), 'left', 'right' or 'outer';
 * - `suffix`: two suffixes (default ['_x', '_y']) appended to the left and
 *   right copies of a non-key column name that exists in both frames.
 *
 * Output rows list the left columns first, then the right columns. Cells of
 * the missing side in left/right/outer joins are NaN for numeric columns and
 * null otherwise. The result is built with `df.constructor.fromRows`.
 *
 * Throws an Error when `other` is missing, `how` is invalid, no join columns
 * are given, the two key lists differ in length, or a key column does not
 * exist in its frame.
 */
export const join =
  () =>
  (df, other, options = {}) => {
    const { on = null, how = 'inner', suffix = ['_x', '_y'] } = options;
    const leftOn = options.leftOn ?? options.left_on ?? null;
    const rightOn = options.rightOn ?? options.right_on ?? null;

    // Validate other DataFrame
    if (!other || !other.columns) {
      throw new Error('Other DataFrame is required');
    }

    // Validate join type
    if (!['inner', 'left', 'right', 'outer'].includes(how)) {
      throw new Error(
        `Invalid join type: ${how}. Must be one of: inner, left, right, outer`,
      );
    }

    // Determine join columns; each side is normalized independently so a
    // string on one side and an array on the other cannot get mixed up.
    let leftCols;
    let rightCols;

    if (on) {
      leftCols = toColumnList(on);
      rightCols = leftCols;
    } else if (leftOn && rightOn) {
      leftCols = toColumnList(leftOn);
      rightCols = toColumnList(rightOn);
    } else {
      throw new Error(
        'Join columns must be specified using either "on" or both "leftOn" and "rightOn"',
      );
    }

    if (leftCols.length === 0 || leftCols.length !== rightCols.length) {
      throw new Error(
        `Join requires the same non-zero number of key columns on both sides (got ${leftCols.length} and ${rightCols.length})`,
      );
    }

    // Validate join columns
    const leftColumnSet = new Set(df.columns);
    const rightColumnSet = new Set(other.columns);

    for (const col of leftCols) {
      if (!leftColumnSet.has(col)) {
        throw new Error(`Column '${col}' not found in left DataFrame`);
      }
    }

    for (const col of rightCols) {
      if (!rightColumnSet.has(col)) {
        throw new Error(`Column '${col}' not found in right DataFrame`);
      }
    }

    // Get rows from both DataFrames
    const leftRows = df.toArray();
    const rightRows = other.toArray();

    // A key column named identically on both sides is emitted once,
    // unsuffixed. Every other name that exists in both frames is an overlap
    // and gets suffixed, even if it is a key on one side only.
    const leftKeySet = new Set(leftCols);
    const rightKeySet = new Set(rightCols);
    const isSharedKey = (col) => leftKeySet.has(col) && rightKeySet.has(col);

    // Output name and placeholder are decided once per column, so every row
    // (matched, left-only, right-only) has the same shape and key order.
    const leftPlan = Array.from(df.columns, (col) => {
      const renamed = `${col}${suffix[0]}`;
      const overlaps = rightColumnSet.has(col) && !isSharedKey(col);
      // Keep the plain name if the suffixed one would clobber a left column.
      const name = overlaps && !leftColumnSet.has(renamed) ? renamed : col;
      return { col, name, fill: missingValueFor(leftRows, col) };
    });

    const rightPlan = Array.from(other.columns, (col) => {
      const shared = isSharedKey(col);
      const name =
        !shared && leftColumnSet.has(col) ? `${col}${suffix[1]}` : col;
      return { col, name, shared, fill: missingValueFor(rightRows, col) };
    });

    // Index right rows by join key
    const rightIndex = new Map();

    for (const row of rightRows) {
      const key = rowKey(row, rightCols);
      const bucket = rightIndex.get(key);
      if (bucket) {
        bucket.push(row);
      } else {
        rightIndex.set(key, [row]);
      }
    }

    const keepUnmatchedLeft = how === 'left' || how === 'outer';
    const keepUnmatchedRight = how === 'right' || how === 'outer';

    // Perform the join
    const joinedRows = [];
    const seenLeftKeys = new Set();

    // Matched rows, plus unmatched left rows for left/outer joins
    for (const leftRow of leftRows) {
      const key = rowKey(leftRow, leftCols);
      seenLeftKeys.add(key);
      const matches = rightIndex.get(key);

      if (matches) {
        for (const rightRow of matches) {
          joinedRows.push(assembleRow(leftPlan, rightPlan, leftRow, rightRow));
        }
      } else if (keepUnmatchedLeft) {
        joinedRows.push(assembleRow(leftPlan, rightPlan, leftRow, null));
      }
    }

    // Unmatched right rows for right/outer joins
    if (keepUnmatchedRight) {
      for (const rightRow of rightRows) {
        if (!seenLeftKeys.has(rowKey(rightRow, rightCols))) {
          joinedRows.push(assembleRow(leftPlan, rightPlan, null, rightRow));
        }
      }
    }

    // Create a new DataFrame from joined rows
    return df.constructor.fromRows(joinedRows);
  };

export default { join };

0 commit comments

Comments
 (0)