|
5 | 5 | */ |
export const join =
  () =>
  (df, other, options = {}) => {
    // Joins `df` (left) with `other` (right) on one or more key columns and
    // returns a new DataFrame built with `df.constructor.fromRows`.
    //
    // Options:
    //   on      - column name or array of names present in both DataFrames
    //   leftOn  - column name(s) in the left DataFrame (used with rightOn)
    //   rightOn - column name(s) in the right DataFrame (used with leftOn)
    //   how     - 'inner' (default), 'left', 'right' or 'outer'
    //   suffix  - [leftSuffix, rightSuffix] appended to non-key column names
    //             that exist in both DataFrames (default ['_x', '_y'])
    //
    // Throws an Error when `other` is missing, `how` is not a known join type,
    // the join columns are not specified, the two key lists differ in length,
    // or a key column does not exist in its DataFrame.
    const {
      on = null, // Column(s) to join on
      leftOn = null, // Left DataFrame column(s) to join on
      rightOn = null, // Right DataFrame column(s) to join on
      how = 'inner', // Join type: 'inner', 'left', 'right', 'outer'
      suffix = ['_x', '_y'], // Suffixes for overlapping column names
    } = options;

    // Validate other DataFrame
    if (!other || !other.columns) {
      throw new Error('Other DataFrame is required');
    }

    // Validate join type
    if (!['inner', 'left', 'right', 'outer'].includes(how)) {
      throw new Error(
        `Invalid join type: ${how}. Must be one of: inner, left, right, outer`,
      );
    }

    // Normalise each side independently so that a scalar on one side and an
    // array on the other are both accepted.
    const toList = (cols) => (Array.isArray(cols) ? cols : [cols]);

    let leftCols;
    let rightCols;

    if (on) {
      // Join on same column names in both DataFrames
      leftCols = toList(on);
      rightCols = leftCols;
    } else if (leftOn && rightOn) {
      // Join on different column names
      leftCols = toList(leftOn);
      rightCols = toList(rightOn);
    } else {
      throw new Error(
        'Join columns must be specified using either "on" or both "leftOn" and "rightOn"',
      );
    }

    if (leftCols.length !== rightCols.length) {
      throw new Error(
        '"leftOn" and "rightOn" must list the same number of columns',
      );
    }

    // Validate join columns
    for (const col of leftCols) {
      if (!df.columns.includes(col)) {
        throw new Error(`Column '${col}' not found in left DataFrame`);
      }
    }

    for (const col of rightCols) {
      if (!other.columns.includes(col)) {
        throw new Error(`Column '${col}' not found in right DataFrame`);
      }
    }

    // Get rows from both DataFrames
    const leftRows = df.toArray();
    const rightRows = other.toArray();

    const leftSuffix = suffix[0];
    const rightSuffix = suffix[1];

    // Set-based lookups so column classification is O(1) inside the row loops.
    const leftColumnSet = new Set(df.columns);
    const rightColumnSet = new Set(other.columns);
    const leftKeySet = new Set(leftCols);
    const rightKeySet = new Set(rightCols);

    // A shared key is a join column with the same name on both sides; it is
    // emitted once, unsuffixed.
    const isSharedKey = (col) => leftKeySet.has(col) && rightKeySet.has(col);

    // Any other name present in both DataFrames is ambiguous and gets a suffix
    // on both sides, for every kind of output row.
    const isOverlap = (col) =>
      leftColumnSet.has(col) && rightColumnSet.has(col) && !isSharedKey(col);

    // Builds the lookup key for a row. Each component is stringified the same
    // way Array.prototype.join would (null/undefined become ''), but the
    // components are JSON-encoded as an array so that values containing the
    // old '|' separator can no longer collide with a different tuple.
    const keyOf = (row, cols) =>
      JSON.stringify(
        cols.map((col) => {
          const value = row[col];
          return value == null ? '' : String(value);
        }),
      );

    // Returns a memoised function giving the filler for a missing value in
    // `col`: NaN when the first non-null value in `rows` is a number, null
    // otherwise (including when there are no rows).
    const makeFiller = (rows) => {
      const cache = new Map();
      return (col) => {
        if (!cache.has(col)) {
          const sample = rows.find((row) => row[col] != null);
          const isNumeric =
            sample !== undefined && typeof sample[col] === 'number';
          cache.set(col, isNumeric ? NaN : null);
        }
        return cache.get(col);
      };
    };
    const leftFiller = makeFiller(leftRows);
    const rightFiller = makeFiller(rightRows);

    const hasOwn = (obj, key) =>
      Object.prototype.hasOwnProperty.call(obj, key);

    // Builds an output row that starts from a left row. `rightRow` is null
    // when the left row has no match (left/outer join); right-hand values are
    // then replaced by fillers.
    const mergeWithLeft = (leftRow, rightRow) => {
      const joinedRow = { ...leftRow };

      for (const col of other.columns) {
        const rightValue =
          rightRow === null ? rightFiller(col) : rightRow[col];

        if (isOverlap(col)) {
          joinedRow[`${col}${rightSuffix}`] = rightValue;
          // Rename the left column unless the suffixed name is already taken
          const leftName = `${col}${leftSuffix}`;
          if (!hasOwn(joinedRow, leftName)) {
            joinedRow[leftName] = leftRow[col];
            delete joinedRow[col];
          }
        } else if (!isSharedKey(col)) {
          // Column only exists in the right DataFrame
          joinedRow[col] = rightValue;
        }
      }

      return joinedRow;
    };

    // Builds an output row for a right row that has no match on the left
    // (right/outer join): left-hand values are fillers.
    const rowFromRightOnly = (rightRow) => {
      const joinedRow = {};

      for (const col of df.columns) {
        if (isSharedKey(col)) {
          continue; // filled from the right row below
        }
        const name = isOverlap(col) ? `${col}${leftSuffix}` : col;
        joinedRow[name] = leftFiller(col);
      }

      for (const col of other.columns) {
        const name = isOverlap(col) ? `${col}${rightSuffix}` : col;
        joinedRow[name] = rightRow[col];
      }

      return joinedRow;
    };

    // Create a map of right rows by join key
    const rightMap = new Map();
    for (const row of rightRows) {
      const key = keyOf(row, rightCols);
      if (!rightMap.has(key)) {
        rightMap.set(key, []);
      }
      rightMap.get(key).push(row);
    }

    // Perform the join
    const joinedRows = [];
    const keepUnmatchedLeft = how === 'left' || how === 'outer';
    const keepUnmatchedRight = how === 'right' || how === 'outer';

    // Matched rows (all join types) and unmatched left rows (left/outer)
    for (const leftRow of leftRows) {
      const matches = rightMap.get(keyOf(leftRow, leftCols));

      if (matches) {
        for (const rightRow of matches) {
          joinedRows.push(mergeWithLeft(leftRow, rightRow));
        }
      } else if (keepUnmatchedLeft) {
        joinedRows.push(mergeWithLeft(leftRow, null));
      }
    }

    // Unmatched right rows (right/outer)
    if (keepUnmatchedRight) {
      const leftKeys = new Set(leftRows.map((row) => keyOf(row, leftCols)));

      for (const rightRow of rightRows) {
        if (!leftKeys.has(keyOf(rightRow, rightCols))) {
          joinedRows.push(rowFromRightOnly(rightRow));
        }
      }
    }

    // Create a new DataFrame from joined rows
    return df.constructor.fromRows(joinedRows);
  };
213 | 229 |
|
214 | 230 | export default { join }; |
0 commit comments