What's the most efficient way to normalize tex

2020-02-06 04:01发布

问题:

In T-SQL I have a column with some text in it with a format like the following:

[Key1:Value1:Value2:Value3:Value4:Value5]
[Key2:Value1:Value2:Value3:Value4:Value5]
[Key3:Value1:Value2:Value3:Value4:Value5]

where there can be any number of bracket sets, but usually between 3 and 6. I'm looking for a way to quickly format them into a temp table or table variable so I can report on the data. For example, I'd want the table format to be:

|Key|Column 1|Column 2|Column 3|Column 4|Column 5|  
|Key 1|Value 1|Value 2|Value 3|Value 4|Value 5|  
|Key 2|Value 1|Value 2|Value 3|Value 4|Value 5|  
|Key 3|Value 1|Value 2|Value 3|Value 4|Value 5|  

I know this is pushing the limits of SQL and should be handled through modification of the application, but I'm hoping there's something clever I can do with T-SQL for now.

回答1:

If you have a maximum number of columns, a little XML within a CROSS APPLY.

If unknown, you would have to go DYNAMIC.

Example

Declare @YourTable Table ([ID] varchar(50),[SomeCol] varchar(50))
Insert Into @YourTable Values 
 (1,'[Key1:Value1:Value2:Value3:Value4:Value5]')
,(2,'[Key2:Value1:Value2:Value3:Value4:Value5]')
,(3,'[Key3:Value1:Value2:Value3:Value4:Value5]')

Select A.ID
      ,B.*
 From  @YourTable A
 Cross Apply (
                Select Pos1 = ltrim(rtrim(xDim.value('/x[1]','varchar(max)')))
                      ,Pos2 = ltrim(rtrim(xDim.value('/x[2]','varchar(max)')))
                      ,Pos3 = ltrim(rtrim(xDim.value('/x[3]','varchar(max)')))
                      ,Pos4 = ltrim(rtrim(xDim.value('/x[4]','varchar(max)')))
                      ,Pos5 = ltrim(rtrim(xDim.value('/x[5]','varchar(max)')))
                      ,Pos6 = ltrim(rtrim(xDim.value('/x[6]','varchar(max)')))
                      ,Pos7 = ltrim(rtrim(xDim.value('/x[7]','varchar(max)')))
                      ,Pos8 = ltrim(rtrim(xDim.value('/x[8]','varchar(max)')))
                      ,Pos9 = ltrim(rtrim(xDim.value('/x[9]','varchar(max)')))
                From  (Select Cast('<x>' + replace(replace(replace(SomeCol,'[',''),']',''),':','</x><x>')+'</x>' as xml) as xDim) as A 
             ) B

Returns

ID  Pos1    Pos2    Pos3    Pos4    Pos5    Pos6    Pos7    Pos8    Pos9
1   Key1    Value1  Value2  Value3  Value4  Value5  NULL    NULL    NULL
2   Key2    Value1  Value2  Value3  Value4  Value5  NULL    NULL    NULL
3   Key3    Value1  Value2  Value3  Value4  Value5  NULL    NULL    NULL

EDIT

I should add, the ltrim(rtrim(...)) is optional and the varchar(max) is my demonstrative default.

EDIT - One String delimited with CRLF

Declare @S varchar(max)='
[Key1:Value1:Value2:Value3:Value4:Value5]
[Key2:Value1:Value2:Value3:Value4:Value5]
[Key3:Value1:Value2:Value3:Value4:Value5]
'

Select B.*
 From  (
        Select RetSeq = Row_Number() over (Order By (Select null))
              ,RetVal = LTrim(RTrim(B.i.value('(./text())[1]', 'varchar(max)')))
        From  (Select x = Cast('<x>' + replace(@S,char(13)+char(10),'</x><x>')+'</x>' as xml).query('.')) as A 
        Cross Apply x.nodes('x') AS B(i)
       ) A
 Cross Apply (
               Select  Pos1 = ltrim(rtrim(xDim.value('/x[1]','varchar(max)')))
                      ,Pos2 = ltrim(rtrim(xDim.value('/x[2]','varchar(max)')))
                      ,Pos3 = ltrim(rtrim(xDim.value('/x[3]','varchar(max)')))
                      ,Pos4 = ltrim(rtrim(xDim.value('/x[4]','varchar(max)')))
                      ,Pos5 = ltrim(rtrim(xDim.value('/x[5]','varchar(max)')))
                      ,Pos6 = ltrim(rtrim(xDim.value('/x[6]','varchar(max)')))
                      ,Pos7 = ltrim(rtrim(xDim.value('/x[7]','varchar(max)')))
                      ,Pos8 = ltrim(rtrim(xDim.value('/x[8]','varchar(max)')))
                      ,Pos9 = ltrim(rtrim(xDim.value('/x[9]','varchar(max)')))
                From  (Select Cast('<x>' + replace(replace(replace(RetVal,'[',''),']',''),':','</x><x>')+'</x>' as xml) as xDim) as A 
       ) B
 Where A.RetVal is not null


回答2:

The fastest way to split a string when you know the maximum number of columns is to use the Cascading CROSS APPLY technique. Let's say you know that their will be no more than 10 items in your string. You could do this:

DECLARE @string varchar(1000) = '[Key1:Value1:Value2:Value3:Value4:Value5]'

SELECT 
  [key] = SUBSTRING(t.string,1,d1.d-1),
  col1  = SUBSTRING(t.string,d1.d+1,d2.d-d1.d-1),
  col2  = SUBSTRING(t.string,d2.d+1,d3.d-d2.d-1),
  col3  = SUBSTRING(t.string,d3.d+1,d4.d-d3.d-1),
  col4  = SUBSTRING(t.string,d4.d+1,d5.d-d4.d-1),
  col5  = SUBSTRING(t.string,d5.d+1,d6.d-d5.d-1),
  col6  = SUBSTRING(t.string,d6.d+1,d7.d-d5.d-1),
  col7  = SUBSTRING(t.string,d7.d+1,d8.d-d5.d-1),
  col8  = SUBSTRING(t.string,d8.d+1,d9.d-d5.d-1),
  col9  = SUBSTRING(t.string,d9.d+1,d10.d-d5.d-1)
FROM (VALUES (REPLACE(REPLACE(@string,']',':'),'[',''))) t(string)
CROSS APPLY (VALUES (CHARINDEX(':',t.string)))                   d1(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d1.d+1),0)))  d2(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d2.d+1),0)))  d3(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d3.d+1),0)))  d4(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d4.d+1),0)))  d5(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d5.d+1),0)))  d6(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d6.d+1),0)))  d7(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d7.d+1),0)))  d8(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d8.d+1),0)))  d9(d)
CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d9.d+1),0)))  d10(d);

To use this technique against a table with the strings stored in rows would be like this:

DECLARE @table TABLE (someid int identity, somestring varchar(1000));
INSERT @table(somestring) VALUES 
('[Key1:Value1:Value2:Value3:Value4:Value5]'),
('[Key2:Value1:Value2:Value3:Value4:Value5]'),
('[Key3:Value1:Value2:Value3:Value4:Value5]'),
('[Key4:Value1:Value2:Value3:Value4:Value5:Value6:Value7:Value8]'),
('[Key5:Value1:Value2:Value3:Value4:Value5:Value6:Value7:Value8:Value9:Value10]');

SELECT * 
FROM @table s
CROSS APPLY
(
  SELECT 
    [key]  = SUBSTRING(t.string,1,d1.d-1),
    dCount = LEN(t.string)-LEN(REPLACE(t.string,':','')),
    col1   = SUBSTRING(t.string,d1.d+1,d2.d-d1.d-1),
    col2   = SUBSTRING(t.string,d2.d+1,d3.d-d2.d-1),
    col3   = SUBSTRING(t.string,d3.d+1,d4.d-d3.d-1),
    col4   = SUBSTRING(t.string,d4.d+1,d5.d-d4.d-1),
    col5   = SUBSTRING(t.string,d5.d+1,d6.d-d5.d-1),
    col6   = SUBSTRING(t.string,d6.d+1,d7.d-d6.d-1),
    col7   = SUBSTRING(t.string,d7.d+1,d8.d-d7.d-1),
    col8   = SUBSTRING(t.string,d8.d+1,d9.d-d8.d-1),
    col9   = SUBSTRING(t.string,d9.d+1,d10.d-d9.d-1)
  FROM (VALUES (REPLACE(REPLACE(s.somestring,']',':'),'[',''))) t(string)
  CROSS APPLY (VALUES (CHARINDEX(':',t.string)))                   d1(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d1.d+1),0)))  d2(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d2.d+1),0)))  d3(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d3.d+1),0)))  d4(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d4.d+1),0)))  d5(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d5.d+1),0)))  d6(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d6.d+1),0)))  d7(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d7.d+1),0)))  d8(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d8.d+1),0)))  d9(d)
  CROSS APPLY (VALUES (NULLIF(CHARINDEX(':',t.string,d9.d+1),0)))  d10(d)
) split
WHERE LEN(s.somestring)-LEN(REPLACE(s.somestring,':','')) < 10

If you don't know the maximum number of possible items you could take this logic and wrap it in some Dynamic SQL that creates the correct number of CROSS APPLY's. I don't have time to put together that logic but, to get the maximum number of possible delimiters you could do something like this:

DECLARE @maxDelimiters tinyint = 
  (SELECT MAX(LEN(s.somestring)-LEN(REPLACE(s.somestring,':',''))) FROM @table s);

Alternatively, if you wanted to use John's technique, you could also use Dynamic SQL to create his query with the exact number of "pos" values required.