Case-insensitive Lua pattern-matching

2019-01-18 23:26发布

问题:

I'm writing a grep utility in Lua for our mobile devices running Windows CE 6/7, but I've run into some issues implementing case-insensitive match patterns. The obvious solution of converting everything to uppercase (or lower) does not work so simply due to the character classes.

The only other thing I can think of is converting the literals in the pattern itself to uppercase.

Here's what I have so far:

--- Upper-case the literal letters of a Lua pattern.
--
-- Every `%x` escape pair is copied through untouched, so character classes
-- such as `%d` or `%l` keep their meaning and an escaped percent sign (`%%`)
-- is recognised as a complete token. Every other character is passed through
-- `string.upper`, which only changes lower-case letters.
--
-- NOTE(review): the result is presumably meant to be matched against an
-- upper-cased subject string; that call site is not part of this function.
--
-- @param instr string: the Lua pattern to convert.
-- @return string: the pattern with its literal letters upper-cased.
function toUpperPattern(instr)
    -- "%%?." walks the pattern token by token: either a '%' together with the
    -- character it escapes, or one ordinary character. Pairing the '%' with
    -- its successor is what keeps "%%a" from being misread as '%' + "%a".
    local converted = string.gsub(instr, "%%?.", function(token)
        if #token == 2 then
            -- Two characters can only mean "'%' + escaped character".
            return token
        end
        return string.upper(token)
    end)
    -- gsub also returns a substitution count; hand back the string only.
    return converted
end

I hate to admit how long it took to get even that far, and I can already see that there are going to be problems with things like escaped percent signs ('%%').

I figured this must be a fairly common issue, but I can't seem to find much on the topic. Are there any easier (or at least complete) ways to do this? I'm starting to go crazy here... Hoping you Lua gurus out there can enlighten me!

回答1:

Try something like this:

--- Build a case-insensitive version of a Lua pattern.
--
-- The pattern is scanned left to right:
--   * `%x` escapes are copied verbatim; `%bxy` additionally keeps its two
--     delimiter characters untouched, because they are literal delimiters
--     and not pattern items.
--   * A letter outside a set becomes the two-member set `[xX]`.
--   * Inside a bracket set, a single letter gets its other-case twin added
--     (`[abc]` -> `[aAbBcC]`), and a range whose endpoints are letters of the
--     same case is emitted for both cases (`[a-f]` -> `[a-fA-F]`).
--   * Any other range (for example `[0-z]`) is copied verbatim, so letters
--     covered only by such a range are NOT case-folded.
--   * Everything else is copied as is.
--
-- @param pattern string: the Lua pattern to convert.
-- @return string: the converted pattern.
function case_insensitive_pattern(pattern)
  local out = {}
  local len = #pattern

  local function emit(piece)
    out[#out + 1] = piece
  end

  -- Both cases of a letter; any other character unchanged.
  local function both_cases(c)
    if c:match("%a") then
      return c:lower() .. c:upper()
    end
    return c
  end

  -- Copy the set whose '[' sits at index `start`, folding the letters in it.
  -- Returns the index of the first character after the set.
  local function copy_set(start)
    emit("[")
    local i = start + 1
    if pattern:sub(i, i) == "^" then
      emit("^")
      i = i + 1
    end
    -- Lua treats a ']' in the very first position as a literal member.
    local at_first = true
    while i <= len do
      local c = pattern:sub(i, i)
      local dash = pattern:sub(i + 1, i + 1)
      local hi = pattern:sub(i + 2, i + 2)
      if c == "]" and not at_first then
        emit(c)
        return i + 1
      end
      at_first = false
      if c == "%" then
        emit(pattern:sub(i, i + 1))
        i = i + 2
      elseif dash == "-" and hi ~= "" and hi ~= "]" then
        -- `c-hi` is a range.
        if hi == "%" then
          -- Keep the '%' glued to the character it escapes so that a
          -- following ']' is not mistaken for the end of the set.
          emit(pattern:sub(i, i + 3))
          i = i + 4
        elseif (c:match("%l") and hi:match("%l"))
            or (c:match("%u") and hi:match("%u")) then
          emit(c:lower() .. "-" .. hi:lower() .. c:upper() .. "-" .. hi:upper())
          i = i + 3
        else
          emit(c .. "-" .. hi)
          i = i + 3
        end
      else
        emit(both_cases(c))
        i = i + 1
      end
    end
    return i -- unterminated set: everything has been copied
  end

  local pos = 1
  while pos <= len do
    local c = pattern:sub(pos, pos)
    if c == "%" then
      local width = pattern:sub(pos + 1, pos + 1) == "b" and 4 or 2
      emit(pattern:sub(pos, pos + width - 1))
      pos = pos + width
    elseif c == "[" then
      pos = copy_set(pos)
    elseif c:match("%a") then
      emit(string.format("[%s%s]", c:lower(), c:upper()))
      pos = pos + 1
    else
      emit(c)
      pos = pos + 1
    end
  end

  return table.concat(out)
end

-- Demo: convert a sample pattern and print the converted pattern.
local sample_pattern = "xyz = %d+ or %% end"
print(case_insensitive_pattern(sample_pattern))

which prints:

[xX][yY][zZ] = %d+ [oO][rR] %% [eE][nN][dD]


回答2:

Lua 5.1, LPeg v0.12

-- Case-insensitive pattern converter built on LPeg's `re` module.
-- NOTE(review): `re` is read as a global and is not required anywhere in the
-- visible code; presumably `re = require 're'` ran earlier - confirm.
do
    -- Grammar used when bracket sets are to be left untouched.
    --   b        : "%b" plus its two delimiter characters, captured verbatim
    --   escaped  : "%" plus the character it escapes, captured verbatim
    --   brackets : a whole "[...]" set, captured verbatim as one piece
    --   other    : a run of characters containing neither '[' nor '%',
    --              passed through the `cases` function below
    local p = re.compile([[
        pattern  <- ( {b} / {escaped} / brackets / other)+
        b        <- "%b" . .
        escaped  <- "%" .
        brackets <- { "[" ([^]%]+ / escaped)* "]" }
        other    <- [^[%]+ -> cases
    ]], {
        -- Replace every letter with a two-member set: x -> [xX].
        -- The outer parentheses drop gsub's second result (the match count).
        cases = function(str) return (str:gsub('%a',function(a) return '['..a:lower()..a:upper()..']' end)) end
    })
    -- Grammar used when letters inside bracket sets are to be folded as well.
    -- It differs from `p` only in `brackets`: the opening "[", each escape and
    -- the closing "]" are captured verbatim, while runs of other set members
    -- go through `bcases`.
    local pb = re.compile([[
        pattern  <- ( {b} / {escaped} / brackets / other)+
        b        <- "%b" . .
        escaped  <- "%" .
        brackets <- {: {"["} ({escaped} / bcases)* {"]"} :}
        bcases   <- [^]%]+ -> bcases
        other    <- [^[%]+ -> cases
    ]], {
        -- Same letter -> [xX] substitution as in `p`.
        cases = function(str) return (str:gsub('%a',function(a) return '['..a:lower()..a:upper()..']' end)) end
        -- Inside a set no extra brackets are added: x -> xX.
        , bcases = function(str) return (str:gsub('%a',function(a) return a:lower()..a:upper() end)) end
    })
    --- Convert a Lua pattern into a case-insensitive one.
    -- @param pattern string: the Lua pattern to convert.
    -- @param brackets when truthy the `pb` grammar is used, so letters inside
    --        "[...]" sets are folded too; otherwise `p` is used and sets are
    --        copied unchanged.
    -- @return string: the captures produced by the grammar, concatenated.
    function iPattern(pattern,brackets)
        -- Run the pattern once through string.find with the result discarded;
        -- string.find raises an error when it encounters a malformed pattern.
        ('sanity check'):find(pattern)
        return table.concat({re.match(pattern, brackets and pb or p)})
    end
end

-- Demo of iPattern: convert one pattern with bracket sets left alone and with
-- bracket sets folded, then match a mixed-case subject against the latter.
local test = '[ab%c%]d%%]+ o%%r %bnm'
local subject = 'qwe [%D]% O%r n---m asd'

local sets_untouched = iPattern(test)
print(sets_untouched)   -- expected: [ab%c%]d%%]+ [oO]%%[rR] %bnm

local sets_folded = iPattern(test, true)
print(sets_folded)      -- expected: [aAbB%c%]dD%%]+ [oO]%%[rR] %bnm

print(subject:match(iPattern(test, true)))  -- expected: %D]% O%r n---m

Pure Lua version:

Every character of the pattern has to be examined to build a correct case-insensitive pattern, because Lua patterns have no alternation operator like the `(abc|something)` of regular expressions.

--- Convert a Lua pattern into one that matches regardless of letter case.
--
-- Outside bracket sets every letter `x` becomes the set `[xX]`. `%x` escapes
-- are copied verbatim, and `%bxy` keeps its two delimiter characters as they
-- are. Bracket sets are copied verbatim unless `brackets` is truthy, in which
-- case:
--   * a single letter gets its other-case twin added (`[ab]` -> `[aAbB]`);
--   * a range whose endpoints are letters of the same case is emitted for
--     both cases (`[a-z]` -> `[a-zA-Z]`);
--   * any other range (e.g. `[0-z]`) is copied verbatim, so letters covered
--     only by such a range are not folded.
-- A ']' directly after '[' or '[^' is treated as a literal set member, as Lua
-- itself does.
--
-- @param pattern string: the Lua pattern to convert.
-- @param brackets when truthy, letters inside bracket sets are folded too.
-- @return string: the converted pattern.
function iPattern(pattern, brackets)
    -- Best-effort validation: string.find raises an error if it runs into a
    -- malformed part of the pattern while matching. The result is discarded.
    string.find('sanity check', pattern)

    local tmp = {}
    local len = #pattern
    local i = 1
    while i <= len do                   -- manual index: steps vary in size
        local char = pattern:sub(i, i)
        if char == '%' then
            -- '%x' is copied as a pair; '%bxy' also owns the next two chars.
            local width = pattern:sub(i + 1, i + 1) == 'b' and 4 or 2
            tmp[#tmp + 1] = pattern:sub(i, i + width - 1)
            i = i + width
        elseif char == '[' then         -- bracket set
            tmp[#tmp + 1] = char
            i = i + 1
            if pattern:sub(i, i) == '^' then
                tmp[#tmp + 1] = '^'
                i = i + 1
            end
            local first = true          -- ']' in first position is literal
            while i <= len do
                char = pattern:sub(i, i)
                local nxt = pattern:sub(i + 1, i + 1)
                local hi = pattern:sub(i + 2, i + 2)
                if char == ']' and not first then
                    tmp[#tmp + 1] = char
                    i = i + 1
                    break               -- end of the set
                elseif char == '%' then -- no '%bxy' inside brackets
                    tmp[#tmp + 1] = char .. nxt
                    i = i + 2
                elseif nxt == '-' and hi ~= '' and hi ~= ']' then
                    -- `char-hi` is a range.
                    if hi == '%' then
                        -- Keep '%' attached to the character it escapes so a
                        -- following ']' is not taken for the end of the set.
                        tmp[#tmp + 1] = pattern:sub(i, i + 3)
                        i = i + 4
                    else
                        local same_case = (char:match('%l') and hi:match('%l'))
                            or (char:match('%u') and hi:match('%u'))
                        if brackets and same_case then
                            tmp[#tmp + 1] = char:lower() .. '-' .. hi:lower()
                                .. char:upper() .. '-' .. hi:upper()
                        else
                            tmp[#tmp + 1] = char .. '-' .. hi
                        end
                        i = i + 3
                    end
                else
                    if brackets and char:match('%a') then
                        tmp[#tmp + 1] = char:lower() .. char:upper()
                    else
                        tmp[#tmp + 1] = char
                    end
                    i = i + 1
                end
                first = false
            end
        elseif char:match('%a') then    -- letter outside a set
            tmp[#tmp + 1] = '[' .. char:lower() .. char:upper() .. ']'
            i = i + 1
        else                            -- anything else
            tmp[#tmp + 1] = char
            i = i + 1
        end
    end
    return table.concat(tmp)
end

-- Demo of iPattern: the same pattern converted with bracket sets left alone
-- and with bracket sets folded, followed by a match against a mixed-case
-- subject using the folded form.
local test = '[ab%c%]d%%]+ o%%r %bnm'
local subject = 'qwe [%D]% O%r n---m asd'

local sets_untouched = iPattern(test)
print(sets_untouched)   -- expected: [ab%c%]d%%]+ [oO]%%[rR] %bnm

local sets_folded = iPattern(test, true)
print(sets_folded)      -- expected: [aAbB%c%]dD%%]+ [oO]%%[rR] %bnm

print(subject:match(iPattern(test, true)))  -- expected: %D]% O%r n---m