T-SQL Algorithm to Encode Unsafe HTML Characters a

2019-02-11 02:48发布

问题:

I need to create an after insert trigger in my SQL Server 2008 R2 database.

The trigger needs to take some fields from INSERTED, process them and put them in different fields in the same row which was inserted.

Re. processing:

I need help coming up with an algorithm in T-SQL code, which encodes unsafe HTML characters such as < (less than) and > (greater than) as HTML character references.

Ideally, I would like to do this in a manner similar to ASP.NET Server.HtmlEncode(myUnsafeHtml), and ideally without using literals all over the place or extra tables i.e. a clever algorithm.

Just asking for help.

So in summary, the after insert trigger takes the value in UnsafeHtml column, processes it and puts it in SafeHtml column in the row which was inserted.

Both columns are of type NVARCHAR.

Thanks in advance.

P.S. Having thought about it a UDF would be more appropriate which I could use in many places.

回答1:

Encoding five special characters with recursive CTE:

DECLARE 
  @unsafe NVARCHAR(MAX),
  @safe   NVARCHAR(MAX) 

--
-- Create the unsafe html string
-- 
SET @unsafe = N'html''s encoding "method" is <= or >= & 1234 ' + NCHAR(129) 
--
-- Use a recursive CTE to iterate through each character in the string
-- 
;WITH cte AS 
(
  -- 
  -- The first row will contain the original
  -- string, an empty string to be used to 
  -- build the "safe" string, and a position
  -- column to mark the character position
  -- of the loop
  -- 
  SELECT 
    @unsafe AS unsafe_html,
    CONVERT(NVARCHAR(MAX), '') AS safe_html,
    1 AS pos
  WHERE @unsafe IS NOT NULL AND LEN(@unsafe) > 0 
  UNION ALL
  -- 
  -- Create a loop: 
  -- The anchor row starts at position one.
  -- Increment the position by one for each pass.
  -- Stop when the position value is equal to the string lenth.
  -- Evaluate the character in each string
  -- If the ASCII value > 128, use the &# format.
  -- Otherwise, check for 5 special characters: " & ' < >
  -- Use the encoding reference or just the original character
  --
  SELECT 
    @unsafe AS unsafe_html,
    CONVERT(NVARCHAR(MAX), safe_html + 
    CASE WHEN UNICODE(SUBSTRING(unsafe_html, pos, 1)) > 128 
         THEN '&#' + CONVERT(NVARCHAR(10), UNICODE(SUBSTRING(unsafe_html, pos, 1)))  
         ELSE CASE SUBSTRING(unsafe_html, pos, 1)
              WHEN '"'  THEN '&quot' 
              WHEN '&'  THEN '&amp'
              WHEN '''' THEN '&apos'
              WHEN '<'  THEN '&lt'
              WHEN '>'  THEN '&gt'
              ELSE SUBSTRING(unsafe_html, pos, 1)
              END 
         END ) AS safe_html,
    pos + 1 AS pos
  FROM cte
  WHERE pos <= LEN(@unsafe)
) 
--
-- Each pass through the string creates a row in the CTE
-- The last row will have the position value of the string length + 1
-- Use that row as the safe html string
-- SQL Server allows a max recursion of 32767
-- 
SELECT @safe = (
  SELECT safe_html 
  FROM cte
  WHERE pos = LEN(@unsafe) + 1
) 
OPTION (MAXRECURSION 32767) 

SELECT @safe

-- html&aposs encoding &quotmethod&quot is &lt= or &gt= &amp 1234 &#129

Initial version:

DECLARE @s NVARCHAR(100)

SET @s = '<html>unsafe & safe<html>'
SELECT @s 
SELECT (SELECT @s FOR XML PATH(''))

---------------------------------------
<html>unsafe & safe<html>

-----------------------------------------
&lt;html&gt;unsafe &amp; safe&lt;html&gt;

Full encoding with all official references:

DECLARE 
    @unsafe  NVARCHAR(MAX),
    @safe NVARCHAR(MAX) 

-- Build string with first 10,000 unicode chars
SELECT @unsafe = COALESCE(@unsafe, '') + NCHAR(number) + ' ' 
FROM (
    SELECT TOP 10000 ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS number
    FROM sys.all_objects s1 CROSS JOIN sys.all_objects s2 
) t

-- Build table variable with character entity references defined in HTML 4.0
-- Reference: http://www.htmlcodetutorial.com/characterentities_famsupp_69.html
DECLARE @t TABLE (
    name NVARCHAR(25) NOT NULL, 
    unicode_val INT NOT NULL PRIMARY KEY 
) 

INSERT @t 
VALUES
('&quot', 34),
('&amp', 38),
('&apos', 39),
('&lt', 60),
('&gt', 62),
('&nbsp', 160),
('&iexcl', 161),
('&cent', 162),
('&pound', 163),
('&curren', 164),
('&yen', 165),
('&brvbar', 166),
('&sect', 167),
('&uml', 168),
('&copy', 169),
('&ordf', 170),
('&laquo', 171),
('&not', 172),
('&shy', 173),
('&reg', 174),
('&macr', 175),
('&deg', 176),
('&plusmn', 177),
('&sup2', 178),
('&sup3', 179),
('&acute', 180),
('&micro', 181),
('&para', 182),
('&middot', 183),
('&cedil', 184),
('&sup1', 185),
('&ordm', 186),
('&raquo', 187),
('&frac14', 188),
('&frac12', 189),
('&frac34', 190),
('&iquest', 191),
('&Agrave', 192),
('&Aacute', 193),
('&Acirc', 194),
('&Atilde', 195),
('&Auml', 196),
('&Aring', 197),
('&AElig', 198),
('&Ccedil', 199),
('&Egrave', 200),
('&Eacute', 201),
('&Ecirc', 202),
('&Euml', 203),
('&Igrave', 204),
('&Iacute', 205),
('&Icirc', 206),
('&Iuml', 207),
('&ETH', 208),
('&Ntilde', 209),
('&Ograve', 210),
('&Oacute', 211),
('&Ocirc', 212),
('&Otilde', 213),
('&Ouml', 214),
('&times', 215),
('&Oslash', 216),
('&Ugrave', 217),
('&Uacute', 218),
('&Ucirc', 219),
('&Uuml', 220),
('&Yacute', 221),
('&THORN', 222),
('&szlig', 223),
('&agrave', 224),
('&aacute', 225),
('&acirc', 226),
('&atilde', 227),
('&auml', 228),
('&aring', 229),
('&aelig', 230),
('&ccedil', 231),
('&egrave', 232),
('&eacute', 233),
('&ecirc', 234),
('&euml', 235),
('&igrave', 236),
('&iacute', 237),
('&icirc', 238),
('&iuml', 239),
('&eth', 240),
('&ntilde', 241),
('&ograve', 242),
('&oacute', 243),
('&ocirc', 244),
('&otilde', 245),
('&ouml', 246),
('&divide', 247),
('&oslash', 248),
('&ugrave', 249),
('&uacute', 250),
('&ucirc', 251),
('&uuml', 252),
('&yacute', 253),
('&thorn', 254),
('&yuml', 255),
('&OElig', 338),
('&oelig', 339),
('&Scaron', 352),
('&scaron', 353),
('&Yuml', 376),
('&fnof', 402),
('&circ', 710),
('&tilde', 732),
('&Alpha', 913),
('&Beta', 914),
('&Gamma', 915),
('&Delta', 916),
('&Epsilon', 917),
('&Zeta', 918),
('&Eta', 919),
('&Theta', 920),
('&Iota', 921),
('&Kappa', 922),
('&Lambda', 923),
('&Mu', 924),
('&Nu', 925),
('&Xi', 926),
('&Omicron', 927),
('&Pi', 928),
('&Rho', 929),
('&Sigma', 931),
('&Tau', 932),
('&Upsilon', 933),
('&Phi', 934),
('&Chi', 935),
('&Psi', 936),
('&Omega', 937),
('&alpha', 945),
('&beta', 946),
('&gamma', 947),
('&delta', 948),
('&epsilon', 949),
('&zeta', 950),
('&eta', 951),
('&theta', 952),
('&iota', 953),
('&kappa', 954),
('&lambda', 955),
('&mu', 956),
('&nu', 957),
('&xi', 958),
('&omicron', 959),
('&pi', 960),
('&rho', 961),
('&sigmaf', 962),
('&sigma', 963),
('&tau', 964),
('&upsilon', 965),
('&phi', 966),
('&chi', 967),
('&psi', 968),
('&omega', 969),
('&thetasym', 977),
('&upsih', 978),
('&piv', 982),
('&ensp', 8194),
('&emsp', 8195),
('&thinsp', 8201),
('&zwnj', 8204),
('&zwj', 8205),
('&lrm', 8206),
('&rlm', 8207),
('&ndash', 8211),
('&mdash', 8212),
('&lsquo', 8216),
('&rsquo', 8217),
('&sbquo', 8218),
('&ldquo', 8220),
('&rdquo', 8221),
('&bdquo', 8222),
('&dagger', 8224),
('&Dagger', 8225),
('&bull', 8226),
('&hellip', 8230),
('&permil', 8240),
('&prime', 8242),
('&Prime', 8243),
('&lsaquo', 8249),
('&rsaquo', 8250),
('&oline', 8254),
('&frasl', 8260),
('&euro', 8364),
('&image', 8465),
('&weierp', 8472),
('&real', 8476),
('&trade', 8482),
('&alefsym', 8501),
('&larr', 8592),
('&uarr', 8593),
('&rarr', 8594),
('&darr', 8595),
('&harr', 8596),
('&crarr', 8629),
('&lArr', 8656),
('&uArr', 8657),
('&rArr', 8658),
('&dArr', 8659),
('&hArr', 8660),
('&forall', 8704),
('&part', 8706),
('&exist', 8707),
('&empty', 8709),
('&nabla', 8711),
('&isin', 8712),
('&notin', 8713),
('&ni', 8715),
('&prod', 8719),
('&sum', 8721),
('&minus', 8722),
('&lowast', 8727),
('&radic', 8730),
('&prop', 8733),
('&infin', 8734),
('&ang', 8736),
('&and', 8743),
('&or', 8744),
('&cap', 8745),
('&cup', 8746),
('&int', 8747),
('&there4', 8756),
('&sim', 8764),
('&cong', 8773),
('&asymp', 8776),
('&ne', 8800),
('&equiv', 8801),
('&le', 8804),
('&ge', 8805),
('&sub', 8834),
('&sup', 8835),
('&nsub', 8836),
('&sube', 8838),
('&supe', 8839),
('&oplus', 8853),
('&otimes', 8855),
('&perp', 8869),
('&sdot', 8901),
('&lceil', 8968),
('&rceil', 8969),
('&lfloor', 8970),
('&rfloor', 8971),
('&lang', 9001),
('&rang', 9002),
('&loz', 9674),
('&spades', 9824),
('&clubs', 9827),
('&hearts', 9829),
('&diams', 9830)

-- Build numbers table to parse the string
DECLARE @numbers TABLE (number INT NOT NULL PRIMARY KEY) 
INSERT @numbers
SELECT TOP (LEN(@unsafe)) ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS number
FROM sys.all_objects s1 CROSS JOIN sys.all_objects s2

-- Use numbers table to parse each character.
-- If a match is found in character entity reference table,
-- then use the safe substitute. Otherwise, if the unicode
-- value is greater than 128, use &#<unicode char value>.
-- Finally, use the original character if nothing else
-- is a match
SELECT @safe = COALESCE(@safe,'') 
       + COALESCE(name, 
         CASE WHEN UNICODE(SUBSTRING(@unsafe, number, 1)) > 128 THEN '&#' 
          + CONVERT(NVARCHAR(10), UNICODE(SUBSTRING(@unsafe, number, 1))) 
          ELSE SUBSTRING(@unsafe, number, 1) END)
FROM @numbers 
LEFT OUTER JOIN @t  
  ON UNICODE(SUBSTRING(@unsafe, number, 1)) = unicode_val

SELECT @safe AS [safe]

Results:  
! &quot # $ % &amp &apos ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; 
&lt = &gt ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 
[ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { 
| } ~   &#129 &#130 &#131 &#132 &#133 &#134 &#135 &#136 &#137 &#138 
&#139 &#140 &#141 &#142 &#143 &#144 &#145 &#146 &#147 &#148 &#149 
&#150 &#151 &#152 &#153 &#154 &#155 &#156 &#157 &#158 &#159 &nbsp 
&iexcl &cent &pound &curren &yen &brvbar &sect &uml &copy &ordf 
&laquo &not &shy &reg &macr &deg &plusmn &sup2 &sup3 &acute &micro 
&para &middot &cedil &sup1 &ordm &raquo &frac14 &frac12 &frac34 
&iquest &Agrave &Aacute &Acirc &Atilde &Auml &Aring &AElig &Ccedil 
&Egrave &Eacute &Ecirc &Euml &Igrave &Iacute &Icirc &Iuml &ETH &Ntilde 
&Ograve &Oacute &Ocirc &Otilde &Ouml &times &Oslash &Ugrave &Uacute 
&Ucirc &Uuml &Yacute &THORN &szlig &agrave &aacute &acirc &atilde &auml 
&aring &aelig &ccedil &egrave &eacute &ecirc &euml &igrave &iacute &icirc 
&iuml &eth &ntilde &ograve &oacute &ocirc &otilde &ouml &divide &oslash 
&ugrave &uacute &ucirc &uuml &yacute &thorn &yuml &#256 &#257 &#258 &#259 
&#260 &#261 &#262 &#263 &#264 &#265 &#266...


回答2:

Declare @Expression nvarchar(max) = 'Y<1'

Select @Expression = (Select @Expression For XML Path(''))

Select @Expression


回答3:

How about creating a SQL UDF in .NET that calls Server.HtmlEncode(myUnsafeHtml) and loading that into the SQL server? To get started:

http://blog.sqlauthority.com/2008/10/19/sql-server-introduction-to-clr-simple-example-of-clr-stored-procedure/

http://www.codeproject.com/Articles/37377/SQL-Server-CLR-Functions

http://msdn.microsoft.com/en-us/library/ms254498(v=vs.80).aspx