Skip to content

Commit c57bdd1

Browse files
[Feature][Transform-V2] Support jsonpath extracting multiple fields in a configuration block (#9712)
1 parent e3e1c67 commit c57bdd1

File tree

6 files changed

+677
-40
lines changed

6 files changed

+677
-40
lines changed

docs/en/transform-v2/jsonpath.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,27 @@ transform {
161161
}
162162
```
163163

164+
The same result can be achieved with much simpler configuration using batch field extraction with array format:
165+
166+
```hocon
167+
transform {
168+
JsonPath {
169+
plugin_input = "fake"
170+
plugin_output = "fake1"
171+
columns = [
172+
{
173+
"src_field" = "data"
174+
"path" = ["$.data.c_string", "$.data.c_boolean", "$.data.c_integer", "$.data.c_float", "$.data.c_double", "$.data.c_decimal", "$.data.c_date", "$.data.c_datetime", "$.data.c_array", "$.data.c_map_array"]
175+
"dest_field" = ["c1_string", "c1_boolean", "c1_integer", "c1_float", "c1_double", "c1_decimal", "c1_date", "c1_datetime", "c1_array", "c1_map_array"]
176+
"dest_type" = ["string", "boolean", "int", "float", "double", "decimal(4,2)", "date", "time", "array<string>", "array<map<string, string>>"]
177+
}
178+
]
179+
}
180+
}
181+
```
182+
183+
**Important:** When using batch field extraction (multiple paths, dest_fields, and dest_types), the `dest_type` parameter is **required** and cannot be omitted. Each extracted field must have a corresponding type specified. The array format provides better readability and is less error-prone than string-based configurations.
184+
164185
Then the data result table `fake1` will like this
165186

166187
| data | c1_string | c1_boolean | c1_integer | c1_float | c1_double | c1_decimal | c1_date | c1_datetime | c1_array |

docs/zh/transform-v2/jsonpath.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,26 @@ transform {
161161
}
162162
```
163163

164+
使用批量字段提取功能可以用更简洁的数组格式配置实现相同的结果:
165+
166+
```hocon
167+
transform {
168+
JsonPath {
169+
plugin_input = "fake"
170+
plugin_output = "fake1"
171+
columns = [
172+
{
173+
"src_field" = "data"
174+
"path" = ["$.data.c_string", "$.data.c_boolean", "$.data.c_integer", "$.data.c_float", "$.data.c_double", "$.data.c_decimal", "$.data.c_date", "$.data.c_datetime", "$.data.c_array", "$.data.c_map_array"]
175+
"dest_field" = ["c1_string", "c1_boolean", "c1_integer", "c1_float", "c1_double", "c1_decimal", "c1_date", "c1_datetime", "c1_array", "c1_map_array"]
176+
"dest_type" = ["string", "boolean", "int", "float", "double", "decimal(4,2)", "date", "time", "array<string>", "array<map<string, string>>"]
177+
}
178+
]
179+
}
180+
}
181+
```
182+
**重要提示:** 当使用批量字段提取(多个 paths、dest_fields 和 dest_types)时,`dest_type` 参数是必填的,不能省略。每个提取的字段都必须指定一个对应的类型。数组格式提供了更好的可读性,比基于字符串的配置更不容易出错。
183+
164184
那么数据结果表 `fake1` 将会像这样
165185

166186
| data | c1_string | c1_boolean | c1_integer | c1_float | c1_double | c1_decimal | c1_date | c1_datetime | c1_array |
@@ -210,6 +230,8 @@ transform {
210230
|------|-----|----------|-------|
211231
| a | 18 | ["a",18] | ... |
212232

233+
234+
213235
## 配置异常数据处理策略
214236

215237
您可以配置 `row_error_handle_way``column_error_handle_way` 来处理异常数据,两者都是非必填项。

seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/java/org/apache/seatunnel/e2e/transform/TestJsonPathTransformIT.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,11 @@ public void testArrayType(TestContainer container) throws Exception {
6666
container.executeJob("/json_path_transform/json_path_array_map.conf");
6767
Assertions.assertEquals(0, execResult.getExitCode());
6868
}
69+
70+
@TestTemplate
71+
public void testBatchFields(TestContainer container) throws Exception {
72+
Container.ExecResult execResult =
73+
container.executeJob("/json_path_transform/json_path_batch_fields_test.conf");
74+
Assertions.assertEquals(0, execResult.getExitCode());
75+
}
6976
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
######
18+
###### This config file tests JsonPath batch fields extraction functionality
19+
######
20+
21+
env {
22+
job.mode = "BATCH"
23+
}
24+
25+
source {
26+
FakeSource {
27+
plugin_output = "fake"
28+
row.num = 100
29+
string.fake.mode = "template"
30+
string.template=["{\"mysql_fields\":{\"id\":{\"v\":1001},\"code\":{\"v\":\"TEST001\"},\"group_code\":{\"v\":\"GROUP001\"},\"user_id\":{\"v\":2001},\"patient_id\":{\"v\":3001},\"doctor_id\":{\"v\":4001},\"price\":{\"v\":99.99},\"status\":{\"v\":1},\"create_time\":{\"v\":\"2023-10-29 10:30:00\"},\"update_time\":{\"v\":\"2023-10-29 11:30:00\"}},\"nested_data\":{\"user\":{\"profile\":{\"name\":\"John\",\"age\":30},\"settings\":{\"theme\":\"dark\",\"lang\":\"en\"}},\"orders\":[{\"id\":101,\"amount\":50.5},{\"id\":102,\"amount\":75.8}]},\"array_fields\":[{\"type\":\"A\",\"value\":100},{\"type\":\"B\",\"value\":200}]}"]
31+
schema = {
32+
fields {
33+
mysql_fields = "string"
34+
}
35+
}
36+
}
37+
}
38+
39+
transform {
40+
JsonPath {
41+
plugin_input = "fake"
42+
plugin_output = "fake1"
43+
columns = [
44+
{
45+
"src_field" = "mysql_fields"
46+
"path" = ["$.mysql_fields.id.v", "$.mysql_fields.code.v", "$.mysql_fields.group_code.v", "$.mysql_fields.user_id.v", "$.mysql_fields.patient_id.v", "$.mysql_fields.doctor_id.v", "$.mysql_fields.price.v", "$.mysql_fields.status.v", "$.mysql_fields.create_time.v", "$.mysql_fields.update_time.v", "$.nested_data.user.profile.name", "$.nested_data.user.profile.age", "$.nested_data.user.settings.theme", "$.nested_data.orders[0].id", "$.nested_data.orders[0].amount", "$.nested_data.orders[1].id", "$.array_fields[0].type", "$.array_fields[0].value", "$.array_fields[1].type", "$.array_fields[1].value"]
47+
"dest_field" = ["id", "code", "group_code", "user_id", "patient_id", "doctor_id", "price", "status", "create_time", "update_time", "user_name", "user_age", "user_theme", "first_order_id", "first_order_amount", "second_order_id", "first_type", "first_value", "second_type", "second_value"]
48+
"dest_type" = ["bigint", "string", "string", "bigint", "bigint", "bigint", "double", "int", "string", "string", "string", "int", "string", "int", "double", "int", "string", "int", "string", "int"]
49+
}
50+
]
51+
}
52+
}
53+
54+
sink {
55+
Assert {
56+
plugin_input = "fake1"
57+
rules =
58+
{
59+
row_rules = [
60+
{
61+
rule_type = MIN_ROW
62+
rule_value = 100
63+
}
64+
],
65+
field_rules = [
66+
{
67+
field_name = id
68+
field_type = bigint
69+
field_value = [
70+
{
71+
rule_type = NOT_NULL
72+
equals_to = 1001
73+
}
74+
]
75+
},
76+
{
77+
field_name = code
78+
field_type = string
79+
field_value = [
80+
{
81+
rule_type = NOT_NULL
82+
equals_to = "TEST001"
83+
}
84+
]
85+
},
86+
{
87+
field_name = group_code
88+
field_type = string
89+
field_value = [
90+
{
91+
rule_type = NOT_NULL
92+
equals_to = "GROUP001"
93+
}
94+
]
95+
},
96+
{
97+
field_name = user_id
98+
field_type = bigint
99+
field_value = [
100+
{
101+
rule_type = NOT_NULL
102+
equals_to = 2001
103+
}
104+
]
105+
},
106+
{
107+
field_name = patient_id
108+
field_type = bigint
109+
field_value = [
110+
{
111+
rule_type = NOT_NULL
112+
equals_to = 3001
113+
}
114+
]
115+
},
116+
{
117+
field_name = doctor_id
118+
field_type = bigint
119+
field_value = [
120+
{
121+
rule_type = NOT_NULL
122+
equals_to = 4001
123+
}
124+
]
125+
},
126+
{
127+
field_name = price
128+
field_type = double
129+
field_value = [
130+
{
131+
rule_type = NOT_NULL
132+
equals_to = 99.99
133+
}
134+
]
135+
},
136+
{
137+
field_name = status
138+
field_type = int
139+
field_value = [
140+
{
141+
rule_type = NOT_NULL
142+
equals_to = 1
143+
}
144+
]
145+
},
146+
{
147+
field_name = create_time
148+
field_type = string
149+
field_value = [
150+
{
151+
rule_type = NOT_NULL
152+
equals_to = "2023-10-29 10:30:00"
153+
}
154+
]
155+
},
156+
{
157+
field_name = update_time
158+
field_type = string
159+
field_value = [
160+
{
161+
rule_type = NOT_NULL
162+
equals_to = "2023-10-29 11:30:00"
163+
}
164+
]
165+
},
166+
{
167+
field_name = user_name
168+
field_type = string
169+
field_value = [
170+
{
171+
rule_type = NOT_NULL
172+
equals_to = "John"
173+
}
174+
]
175+
},
176+
{
177+
field_name = user_age
178+
field_type = int
179+
field_value = [
180+
{
181+
rule_type = NOT_NULL
182+
equals_to = 30
183+
}
184+
]
185+
},
186+
{
187+
field_name = user_theme
188+
field_type = string
189+
field_value = [
190+
{
191+
rule_type = NOT_NULL
192+
equals_to = "dark"
193+
}
194+
]
195+
},
196+
{
197+
field_name = first_order_id
198+
field_type = int
199+
field_value = [
200+
{
201+
rule_type = NOT_NULL
202+
equals_to = 101
203+
}
204+
]
205+
},
206+
{
207+
field_name = first_order_amount
208+
field_type = double
209+
field_value = [
210+
{
211+
rule_type = NOT_NULL
212+
equals_to = 50.5
213+
}
214+
]
215+
},
216+
{
217+
field_name = second_order_id
218+
field_type = int
219+
field_value = [
220+
{
221+
rule_type = NOT_NULL
222+
equals_to = 102
223+
}
224+
]
225+
},
226+
{
227+
field_name = first_type
228+
field_type = string
229+
field_value = [
230+
{
231+
rule_type = NOT_NULL
232+
equals_to = "A"
233+
}
234+
]
235+
},
236+
{
237+
field_name = first_value
238+
field_type = int
239+
field_value = [
240+
{
241+
rule_type = NOT_NULL
242+
equals_to = 100
243+
}
244+
]
245+
},
246+
{
247+
field_name = second_type
248+
field_type = string
249+
field_value = [
250+
{
251+
rule_type = NOT_NULL
252+
equals_to = "B"
253+
}
254+
]
255+
},
256+
{
257+
field_name = second_value
258+
field_type = int
259+
field_value = [
260+
{
261+
rule_type = NOT_NULL
262+
equals_to = 200
263+
}
264+
]
265+
}
266+
]
267+
}
268+
}
269+
}

0 commit comments

Comments
 (0)