TSQL: Find a continuous number in a string

2019-08-14 08:59发布

问题:

I need to find a continuous 6 or 7 digit number in a string from column name Filename. The string has other numbers in it with dashes(or another character, like an underscore), but I only need the continuous number

The StudentID needs to be extracted from the filename. (I know the data is just wow, multiple vendors, multiple file naming formats is the cause.) Another option would be to just list the starting position of the continuous number.

Desired outcome:

Actual outcome:

Test Code:

DROP TABLE #StuID

CREATE TABLE #StuID (
 FILENAME VARCHAR(MAX)
,StudentID INT
)

INSERT INTO #StuID
( FILENAME  )
VALUES
 ('Smith John D, 11-23-1980, 1234567.pdf')
,('Doe Jane, _01_22_1980_123456.pdf')
,('John Doe, 567891.pdf' )

--This is what I tried.

SELECT FILENAME
, substring(FileName, patindex('%[0-9][0-9][0-9][0-9][0-9][0-9]%', FileName), 8) AS StudentID
FROM #StuID

回答1:

Because you want 6 or 7 digits, case might be the simplest solution:

SELECT FILENAME,
       (CASE WHEN FileName LIKE '%[0-9][0-9][0-9][0-9][0-9][0-9][0-9]%'
             THEN substring(FileName, patindex('%[0-9][0-9][0-9][0-9][0-9][0-9]%', FileName), 7)
             WHEN FileName LIKE '%[0-9][0-9][0-9][0-9][0-9][0-9]%'
             THEN substring(FileName, patindex('%[0-9][0-9][0-9][0-9][0-9]%', FileName), 6)
        END) AS StudentID                 
FROM #StuID


回答2:

Another approach I like a lot is a cast to XML and a XQuery filter:

WITH Casted([FileName],ToXml) AS
(
    SELECT [FILENAME] 
          ,CAST('<x>' + REPLACE(REPLACE(REPLACE([FILENAME],' ','</x><x>'),'.','</x><x>'),'_','</x><x>') + '</x>' AS XML)
    FROM #StuID
)
SELECT [FileName] 
      ,numbers.value('text()[1]','int')
FROM Casted
CROSS APPLY ToXml.nodes('/x[not(empty(. cast as xs:int?))]') A(numbers);

This will split the string in its fragments and return all fragments, which are numbers.

You can easily reduce the set to StudentIDs by using any convenient WHERE clause or you add to the XQuery filter the length of 6 or 7:

CROSS APPLY ToXml.nodes('/x[not(empty(. cast as xs:int?)) 
                            and (string-length(.)=6 or string-length(.)=7)]') A(numbers)

EDIT

This would be most on point:

CROSS APPLY ToXml.nodes('/x[. cast as xs:int? >= 100000 and . cast as xs:int? <10000000]') A(numbers)


回答3:

If you know that filetype is pdf then:

SELECT FILENAME
, substring(REPLACE(FileName, '.pdf',''), patindex('%[0-9][0-9][0-9][0-9][0-9][0-9]%', FileName), 8)
  AS StudentID
FROM #StuID;

db<>fiddle demo


More generic one (SQL Server 2017):

SELECT FILENAME
, substring(s.c, patindex('%[0-9][0-9][0-9][0-9][0-9][0-9]%', s.c), 8) AS StudentID
FROM #StuID
CROSS APPLY (SELECT trim(' !"#$%&\''()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~' FROM filename) AS c) s

db<>fiddle demo2