Converting a table with a key and comment field in

2019-07-31 01:23发布

问题:

I have a table with unstructured data I am trying to analyze to try to build a relational lookup. I do not have use of word cloud software.

I really have no idea how to solve this problem. Searching for solutions has lead me to tools that might do this for me that cost money, not coded solutions.

Basically my data looks like this:

CK1          CK2          Comment
--------------------------------------------------------------
 1            A           This is a comment.
 2            A           Another comment here.

And this is what I need to create:

CK1          CK2          Words
--------------------------------------------------------------
 1            A           This
 1            A           is
 1            A           a
 1            A           comment.
 2            A           Another
 2            A           comment
 2            A           here.

回答1:

What you are trying to do is tokenize a string using a space as a Delimiter. In the SQL world people often refer to functions that do this as a "Splitter". The potential pitfall of using a splitter for this type of thing is how words can be separated by multiple spaces, tabs, CHAR(10)'s, CHAR(13)'s, CHAR()'s, etc. Poor grammar, such as not adding a space after a period results in this:

" End of sentence.Next sentence" 

sentence.Next is returned as a word.

The way I like to tokenize human text is to:

  1. Replace any text that isn't a character with a space
  2. Replace duplicate spaces
  3. Trim the string
  4. Split the newly transformed string using a space as the delimiter.

Below is my solution followed by the DDL to create the functions used.

-- Sample Data
DECLARE @yourtable TABLE (CK1 INT, CK2 CHAR(1), Comment VARCHAR(8000));
INSERT @yourtable (CK1, CK2, Comment)
VALUES
(1,'A','This is a typical comment...Follewed by another...'),
(2,'A','This comment has  double  spaces  and       tabs and even carriage
returns!');

-- Solution
SELECT      t.CK1, t.CK2, split.itemNumber, split.itemIndex, split.itemLength, split.item
FROM        @yourtable                                              AS t
CROSS APPLY samd.patReplace(t.Comment,'[^a-zA-Z ]',' ')             AS c1
CROSS APPLY samd.removeDupChar8K(c1.newString,' ')                  AS c2 
CROSS APPLY samd.delimitedSplitAB8K(LTRIM(RTRIM(c2.NewString)),' ') AS split;

Results (truncated for brevity):

CK1         CK2  itemNumber           itemIndex   itemLength  item
----------- ---- -------------------- ----------- ----------- --------------
1           A    1                    1           4           This
1           A    2                    6           2           is
1           A    3                    9           1           a
1           A    4                    11          7           typical
1           A    5                    19          7           comment
...
2           A    1                    1           4           This
2           A    2                    6           7           comment
2           A    3                    14          3           has
2           A    4                    18          6           double
... 

Note that the splitter I'm using is based of Jeff Moden's Delimited Split8K with a couple tweeks.

Functions used:

CREATE FUNCTION dbo.rangeAB
(
  @low  bigint, 
  @high bigint, 
  @gap  bigint,
  @row1 bit
)
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH L1(N) AS 
(
  SELECT 1
  FROM (VALUES
   (0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),
   (0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),
   (0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),
   (0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),(0),
   (0),(0)) T(N) -- 90 values 
),
L2(N)  AS (SELECT 1 FROM L1 a CROSS JOIN L1 b CROSS JOIN L1 c),
iTally AS (SELECT rn = ROW_NUMBER() OVER (ORDER BY (SELECT 1)) FROM L2 a CROSS JOIN L2 b)
SELECT r.RN, r.OP, r.N1, r.N2
FROM
(
  SELECT
    RN = 0,
    OP = (@high-@low)/@gap,
    N1 = @low,
    N2 = @gap+@low
  WHERE @row1 = 0
  UNION ALL -- COALESCE required in the TOP statement below for error handling purposes
  SELECT TOP (ABS((COALESCE(@high,0)-COALESCE(@low,0))/COALESCE(@gap,0)+COALESCE(@row1,1)))
    RN = i.rn,
    OP = (@high-@low)/@gap+(2*@row1)-i.rn,
    N1 = (i.rn-@row1)*@gap+@low,
    N2 = (i.rn-(@row1-1))*@gap+@low
  FROM iTally AS i
  ORDER BY i.rn
) AS r
WHERE @high&@low&@gap&@row1 IS NOT NULL AND @high >= @low AND @gap > 0;
    GO

CREATE FUNCTION samd.NGrams8k
(
  @string VARCHAR(8000), -- Input string
  @N      INT            -- requested token size
)
RETURNS TABLE WITH SCHEMABINDING AS RETURN
SELECT
  position   = r.RN,
  token      = SUBSTRING(@string, CHECKSUM(r.RN), @N)
FROM  dbo.rangeAB(1, LEN(@string)+1-@N,1,1) AS r
WHERE @N > 0 AND @N <= LEN(@string);
GO

    CREATE FUNCTION samd.patReplace8K
(
  @string  VARCHAR(8000),
  @pattern VARCHAR(50),
  @replace VARCHAR(20)
) 
RETURNS TABLE WITH SCHEMABINDING AS RETURN
SELECT newString = 
  (
    SELECT   CASE WHEN @string = CAST('' AS VARCHAR(8000)) THEN CAST('' AS VARCHAR(8000))
                  WHEN @pattern+@replace+@string IS NOT NULL THEN 
                    CASE WHEN PATINDEX(@pattern,token COLLATE Latin1_General_BIN)=0
                         THEN ng.token ELSE @replace END END
    FROM     samd.NGrams8K(@string, 1) AS ng
    ORDER BY ng.position
    FOR XML PATH(''),TYPE
  ).value('text()[1]', 'VARCHAR(8000)');
GO

    CREATE FUNCTION samd.delimitedSplitAB8K
(
  @string    VARCHAR(8000), -- input string
  @delimiter CHAR(1)        -- delimiter
)
RETURNS TABLE WITH SCHEMABINDING AS RETURN
SELECT
  itemNumber   = ROW_NUMBER() OVER (ORDER BY d.p),
  itemIndex    = CHECKSUM(ISNULL(NULLIF(d.p+1, 0),1)),
  itemLength   = CHECKSUM(item.ln),
  item         = SUBSTRING(@string, d.p+1, item.ln)
FROM (VALUES (DATALENGTH(@string))) AS l(s) -- length of the string
CROSS APPLY
(
  SELECT 0 UNION ALL -- for handling leading delimiters
  SELECT ng.position
  FROM   samd.NGrams8K(@string, 1) AS ng
  WHERE  token = @delimiter
) AS d(p) -- delimiter.position
CROSS APPLY (VALUES(  --LEAD(d.p, 1, l.s+l.d) OVER (ORDER BY d.p) - (d.p+l.d)
  ISNULL(NULLIF(CHARINDEX(@delimiter,@string,d.p+1),0)-(d.p+1), l.s-d.p))) AS item(ln);
GO

CREATE FUNCTION dbo.RemoveDupChar8K(@string varchar(8000), @char char(1))
RETURNS TABLE WITH SCHEMABINDING AS RETURN

SELECT NewString = 
 replace(replace(replace(replace(replace(replace(replace(
 @string COLLATE LATIN1_GENERAL_BIN,
 replicate(@char,33), @char), --33
 replicate(@char,17), @char), --17
 replicate(@char,9 ), @char), -- 9
 replicate(@char,5 ), @char), -- 5
 replicate(@char,3 ), @char), -- 3 
 replicate(@char,2 ), @char), -- 2
 replicate(@char,2 ), @char); -- 2
GO


回答2:

1) If we are using SQL Server 2016 and above then we should probably use the built-in function STRING_SPLIT

-- SQL 2016and above
DECLARE @txt NVARCHAR(100) = N'This is a comment.'
select [value] from STRING_SPLIT(@txt, ' ') 

2) Only if 1 does not fit, then if the number of separation (the space in our case) is less then 3 which fit your sample data, then we should probably use PARSENAME

-- BEFORE SQL 2016 if we have less than 4 parts
DECLARE @txt NVARCHAR(100) = N'This is a comment.'
DECLARE @Temp NVARCHAR(200) = REPLACE (@txt,'.','@')
SELECT t FROM (VALUES(1),(2),(3),(4))T1(n)
CROSS APPLY (SELECT REPLACE(PARSENAME(REPLACE(@Temp,' ','.'),T1.n), '@','.'))T2(t)

3) Only if the 1 and 2 does not fit, then we should use SQLCLR function

http://dataeducation.com/sqlclr-string-splitting-part-2-even-faster-even-more-scalable/

4) Only if we cannot use 1,2 and we cannot use SQLCLR (which implies a real problematic administration and has nothing with security since you can have all the SQLCLR function in a read-only database for the use of all users, as I explain in my lectures), then you can use T-SQL and create UDF.

https://sqlperformance.com/2012/07/t-sql-queries/split-strings



标签: tsql