admin管理员组

文章数量:1122826

Need to parse nested XML in Snowflake. Example:


    WITH xml_table
    AS (
        SELECT 1 AS ID
            , PARSE_XML(
                '<root>
                    <docs>
                        <doc>
                            <Id>1</Id>
                            <Name>
                                <Kvp>
                                    <Key>k1</Key>
                                    <Value>v1</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k2</Key>
                                    <Value>v2</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k3</Key>
                                    <Value>v3</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>2</Id>
                            <Name>
                                <Kvp>
                                    <Key>k4</Key>
                                    <Value>v4</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k5</Key>
                                    <Value>v5</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>3</Id>
                            <Name>
                                <Kvp>
                                    <Key>k6</Key>
                                    <Value>v6</Value>
                                </Kvp>
                            </Name>
                        </doc>
                    </docs>
                </root>'
            ) AS XML_COL
        )
    SELECT docs.ID
        , docs.docId
        --, docs.docName
        , GET(XMLGET(kvps.VALUE, 'Key'), '$')::STRING AS docNameKey
        , GET(XMLGET(kvps.VALUE, 'Value'), '$')::STRING AS docNameValue
        --, kvps.*
    FROM --docs
        (
        SELECT xml_table.ID
            , xml_table.XML_COL
            , GET(xml_table.XML_COL, '@')::STRING AS ROOT_NODE_NAME
            , XMLGET(docs.VALUE, 'Id') : "$"::STRING AS docId
            , XMLGET(docs.VALUE, 'Name') AS docName
        FROM xml_table
            , LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
        WHERE 1 = 1
        ) docs
        , LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
    WHERE 1 = 1

So i need to parse it to table format:

    doc.Id AS docId
    doc.Name.Kvp.Key AS docNameKey
    doc.Name.Kvp.Value AS docNameValue

The given example seesm to work as expected when there is more than one key-value-pair under KVP(like the docId IN (1,2) in the example), but then when there is one key-value-pair then the result is not as expected. Why?

The query returns:

ID  DOCID   DOCNAMEKEY  DOCNAMEVALUE
1   1   k1  v1
1   1   k2  v2
1   1   k3  v3
1   2   k4  v4
1   2   k5  v5
1   3   null    null
1   3   null    null
1   3   null    null
1   3   null    null

As you see the docId=3 has been "blown" to 4 records and no key-value-pair is found. But it has one key value pair k6-v6.

So the expected output (what i would like to get) is:

ID  DOCID   DOCNAMEKEY  DOCNAMEVALUE
1   1   k1  v1
1   1   k2  v2
1   1   k3  v3
1   2   k4  v4
1   2   k5  v5
1   3   k6  v6

Why does the xml parsing depend on the number of sub-elements?

Need to parse nested XML in Snowflake. Example:


    WITH xml_table
    AS (
        SELECT 1 AS ID
            , PARSE_XML(
                '<root>
                    <docs>
                        <doc>
                            <Id>1</Id>
                            <Name>
                                <Kvp>
                                    <Key>k1</Key>
                                    <Value>v1</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k2</Key>
                                    <Value>v2</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k3</Key>
                                    <Value>v3</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>2</Id>
                            <Name>
                                <Kvp>
                                    <Key>k4</Key>
                                    <Value>v4</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k5</Key>
                                    <Value>v5</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>3</Id>
                            <Name>
                                <Kvp>
                                    <Key>k6</Key>
                                    <Value>v6</Value>
                                </Kvp>
                            </Name>
                        </doc>
                    </docs>
                </root>'
            ) AS XML_COL
        )
    SELECT docs.ID
        , docs.docId
        --, docs.docName
        , GET(XMLGET(kvps.VALUE, 'Key'), '$')::STRING AS docNameKey
        , GET(XMLGET(kvps.VALUE, 'Value'), '$')::STRING AS docNameValue
        --, kvps.*
    FROM --docs
        (
        SELECT xml_table.ID
            , xml_table.XML_COL
            , GET(xml_table.XML_COL, '@')::STRING AS ROOT_NODE_NAME
            , XMLGET(docs.VALUE, 'Id') : "$"::STRING AS docId
            , XMLGET(docs.VALUE, 'Name') AS docName
        FROM xml_table
            , LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
        WHERE 1 = 1
        ) docs
        , LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
    WHERE 1 = 1

So i need to parse it to table format:

    doc.Id AS docId
    doc.Name.Kvp.Key AS docNameKey
    doc.Name.Kvp.Value AS docNameValue

The given example seesm to work as expected when there is more than one key-value-pair under KVP(like the docId IN (1,2) in the example), but then when there is one key-value-pair then the result is not as expected. Why?

The query returns:

ID  DOCID   DOCNAMEKEY  DOCNAMEVALUE
1   1   k1  v1
1   1   k2  v2
1   1   k3  v3
1   2   k4  v4
1   2   k5  v5
1   3   null    null
1   3   null    null
1   3   null    null
1   3   null    null

As you see the docId=3 has been "blown" to 4 records and no key-value-pair is found. But it has one key value pair k6-v6.

So the expected output (what i would like to get) is:

ID  DOCID   DOCNAMEKEY  DOCNAMEVALUE
1   1   k1  v1
1   1   k2  v2
1   1   k3  v3
1   2   k4  v4
1   2   k5  v5
1   3   k6  v6

Why does the xml parsing depend on the number of sub-elements?

Share Improve this question edited Nov 25, 2024 at 7:54 user12761950 asked Nov 22, 2024 at 8:20 user12761950user12761950 1491 gold badge2 silver badges10 bronze badges 2
  • Can you share expected output based on input, so that we can test – samhita Commented Nov 24, 2024 at 10:05
  • @samhita updated the original question with the result i get and what i would like to get. The docId=3 which has one key-value-pair is parsed differently than the other ones which have more sub elements. Is this expected behaviour? – user12761950 Commented Nov 25, 2024 at 7:52
Add a comment  | 

1 Answer 1

Reset to default 0

The way I solved it currently:

WITH xml_table
AS (
    SELECT 1 AS ID
        , PARSE_XML(
            '<root>
                    <docs>
                        <doc>
                            <Id>1</Id>
                            <Name>
                                <Kvp>
                                    <Key>k1</Key>
                                    <Value>v1</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k2</Key>
                                    <Value>v2</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k3</Key>
                                    <Value>v3</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>2</Id>
                            <Name>
                                <Kvp>
                                    <Key>k4</Key>
                                    <Value>v4</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k5</Key>
                                    <Value>v5</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>3</Id>
                            <Name>
                                <Kvp>
                                    <Key>k6</Key>
                                    <Value>v6</Value>
                                </Kvp>
                            </Name>
                        </doc>
                    </docs>
                </root>'
        ) AS XML_COL
    )
SELECT docs.ID
    , docs.docId
    , CASE 
        WHEN kvps.KEY = '$' --For some reason Snowflake works differently if there is 1 sub element than when multiple
            THEN CAST(GET(XMLGET(kvps.THIS, 'Key'), '$') AS STRING)
        ELSE CAST(GET(XMLGET(kvps.VALUE, 'Key'), '$') AS STRING)
        END AS docNameKey
    , CASE 
        WHEN kvps.KEY = '$' --For some reason Snowflake works differently if there is 1 sub element than when multiple
            THEN CAST(GET(XMLGET(kvps.THIS, 'Value'), '$') AS STRING)
        ELSE CAST(GET(XMLGET(kvps.VALUE, 'Value'), '$') AS STRING)
        END AS docNameValue
FROM --docs
    (
    SELECT xml_table.ID
        , xml_table.XML_COL
        , CAST(GET(xml_table.XML_COL, '@') AS STRING) AS ROOT_NODE_NAME
        , CAST(GET(XMLGET(docs.VALUE, 'Id'), '$') AS STRING) AS docId
        , XMLGET(docs.VALUE, 'Name') AS docName
    FROM xml_table
        , LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
    WHERE 1 = 1
    ) docs
    , LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
WHERE 1 = 1
    AND (
        kvps.KEY IS NULL
        OR kvps.KEY = '$'
        ) --For some reason Snowflake works differently if there is 1 sub element than when multiple

So added the WHERE clause condition to exclude the "extra" lines Snowflake adds when there is only one sub set of value pairs. And then in SELECT clause extra CASE statements to pull the values either from THIS or VALUE attributes depending on is there one or many value pairs.

本文标签: xml parsingParse nested XML with key value pairs in SnowflakeStack Overflow