可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
Each 'Product' can have as many as 10000 'Segment' rows. The segments have a sort column that starts at 1 for each product (1, 2, 3, 4, 5, ...) and a value column that can contain any values such as (323.113, 5423.231, 873.42, 422.64, 763.1, ...).
I would like to identify potential matches for products given a subset of segments. For example, if I have 5 segment values in the correct order, how can I efficiently find all the products which have all 5 segments in the same order somewhere in the Segment table?
Update
I posted a follow-up question to this here: Find a series of data using non-exact measurements (fuzzy logic)
回答1:
Assume tables like so:
-- Parent table: one row per product.
CREATE TABLE Products
(
ProductId int not null
constraint PK_Products
primary key
,Name varchar(100) not null
)
-- Child table: the ordered segment values for each product
-- (up to 10000 rows per product, per the question).
CREATE TABLE Segments
(
ProductId int not null
constraint FK_Segments__Products
foreign key references Products (ProductId)
,OrderBy int not null -- sort position within the product (starts at 1)
,Value float not null -- the segment's measured value
,constraint PK_Segments
primary key (ProductId, OrderBy) -- clustered lookup by product + position
)
Next, set up your search data in a temp table:
-- Temp table holding the N search values, in the order they must appear.
-- NOTE: Position is 0-based here (0 .. N-1), unlike Segments.OrderBy which
-- starts at 1; the final query's offset arithmetic relies on that.
CREATE TABLE #MatchThis
(
Position int not null
,Value float not null
)
For N search objects, this has to be populated like so
First item 0 <value 1>
Second item 1 <value 2>
Third item 2 <value 3>
...
Nth item N-1 <value N>
Now set up some important values. (This could be crammed into the final query, but this way makes it easier to read, and may improve performance slightly.)
-- Capture the search-set size and its anchor value up front; keeping these in
-- variables (rather than inline subqueries) keeps the final query readable.
DECLARE @ItemCount int, @FirstValue float;

-- "N": how many segment values must match.
SELECT @ItemCount = COUNT(*)
FROM #MatchThis;

-- The anchor: the value of the first item (Position = 0) in the search set.
SELECT @FirstValue = mt.Value
FROM #MatchThis mt
WHERE mt.Position = 0;
And then it's just a query:
-- For each candidate anchor (a segment equal to the first search value), line
-- up every search item at its offset from the anchor and keep only anchors
-- where all @ItemCount items found a matching segment.
-- FIX: the original referenced tables "Segment" and "Product", but the DDL
-- above creates "Segments" and "Products" — corrected below.
SELECT
pr.Name
,fv.OrderBy -- Anchor position; required by the Group By, but otherwise can be ignored
from #MatchThis mt
cross join (-- All Segments rows that match the first value in the search set
            select ProductId, OrderBy
            from Segments
            where Value = @FirstValue) fv
inner join Products pr -- Just to get the Product name
    on pr.ProductId = fv.ProductId
inner join Segments se
    on se.ProductId = fv.ProductId
    and se.OrderBy = fv.OrderBy + mt.Position -- Lines them up based on the first value
    and se.Value = mt.Value -- No join if the values don't match
group by
pr.Name
,fv.OrderBy
having count(*) = @ItemCount -- Only include if as many segments matched for this product/anchor as are required
I am convinced this will work, but I don't have time to test it out in detail just now. To optimize performance, besides the primary keys indicated, you could add a regular (nonclustered) index on Segments.Value to speed up the anchor lookup on the first search value.
回答2:
Probably the best performing way of doing this would be to store a denormalized version of the data.
ProductId, DelimitedList
1 ,323.113,5423.231,873.42,422.64,763.1,
Then your search is a simple
WHERE DelimitedList LIKE '%,323.113,5423.231,873.42,%'
You could do a standard relational division query first to bring back those ProductId
values that match all values (not necessarily in the correct order or contiguous) to reduce the number of strings that are searched.
Full Demo Script
/*Set up test tables*/
CREATE TABLE Products(
ProductId int primary key)
-- Segment rows keyed by (ProductId, Sort); the extra nonclustered index on
-- (ProductId, Value) supports the relational-division pre-filter in the search.
CREATE TABLE ProductSegments(
ProductId int REFERENCES Products,
Sort int,
Value decimal(10,3)
Primary key (ProductId,Sort))
CREATE NONCLUSTERED INDEX ix ON ProductSegments(ProductId,Value)
-- One denormalized row per product: all segment values flattened into a
-- single delimited string, searched with LIKE below.
CREATE TABLE ProductSegmentsDenormalized
(
ProductId int REFERENCES Products,
DelimitedList varchar(max)
)
/*Insert some initial data to Products...*/
INSERT INTO Products VALUES (1),(2),(3)
/*... and for ProductSegments*/
-- Generate sequence numbers 1..10000 by cross joining spt_values with itself,
-- then give every product 10000 segments with pseudo-random decimal values.
;WITH numbers(N)
AS (SELECT TOP 10000 ROW_NUMBER() OVER (ORDER BY (SELECT 0))
FROM master..spt_values v1,
master..spt_values v2)
INSERT INTO ProductSegments
(ProductId,
Sort,
Value)
SELECT ProductId AS Product,
n1.N Sort,
( ABS(CHECKSUM(NEWID()))% 1000000000 ) / 1000.00 -- random decimal, 3 fractional digits
FROM numbers n1,
Products
/*Set up table for search data*/
-- The 5 values to find, with Sequence giving their required order (1-based).
DECLARE @SearchValues TABLE
(
Sequence int primary key,
Value decimal(10,3)
)
INSERT INTO @SearchValues
VALUES (1,323.113),(2,5423.231),(3,873.420),(4,422.640),(5,763.100)
/*Fiddle the test data so we have some guaranteed matches*/
-- Overwrite product 1's segments at Sort 101-105 with the search values...
UPDATE ps
SET ps.Value = sv.Value
FROM ProductSegments ps
JOIN @SearchValues sv ON ProductId = 1 AND Sort = 100 + Sequence
-- ...and product 3's segments at Sort 988-992, so both should be found.
UPDATE ps
SET ps.Value = sv.Value
FROM ProductSegments ps
JOIN @SearchValues sv ON ProductId = 3 AND Sort = 987 + Sequence
/*Create the denormalised data*/
-- Flatten each product's segments (ordered by Sort) into one pipe-delimited
-- string using the FOR XML PATH('') concatenation idiom. A '|' is prepended
-- and each value carries a trailing '|', so every value -- including the
-- first and last -- is fully wrapped in delimiters for exact LIKE matching.
-- (XML escaping is not a concern here: the values are purely numeric.)
INSERT INTO ProductSegmentsDenormalized
SELECT ProductId, '|' + DelimitedList
FROM Products p
CROSS APPLY ( SELECT CAST(Value as varchar) + '|'
FROM ProductSegments ps
WHERE ps.ProductId = p.ProductId
ORDER BY Sort
FOR XML PATH('') ) D ( DelimitedList )
/*Do the search*/
-- Stage 1 (IN subquery): relational division via double NOT EXISTS -- keep
-- only products where every search value appears somewhere, in any order.
-- This is a cheap pre-filter using the (ProductId, Value) index.
-- Stage 2 (LIKE): scan the surviving products' delimited strings for the
-- search values contiguously and in Sequence order.
SELECT ProductId
FROM ProductSegmentsDenormalized psd
WHERE psd.ProductId IN (SELECT p.ProductId
FROM Products p
WHERE NOT EXISTS (SELECT *
FROM @SearchValues sv
WHERE NOT EXISTS
(SELECT *
FROM ProductSegments ps
WHERE ps.ProductId = p.ProductId
AND sv.Value = ps.Value)))
-- Build the pattern '%|v1|v2|...|vN|%' with the same FOR XML PATH idiom
-- used when the denormalized strings were created.
AND DelimitedList LIKE '%|' + (SELECT CAST(Value AS VARCHAR) + '|'
FROM @SearchValues sv
ORDER BY Sequence
FOR XML PATH('')) + '%'
回答3:
Possibly what I am trying to suggest takes too much space: once you add a new product, since the segments are static, you may index them by taking all the "suffixes" of the segment list.
For example, the segments list:
34 57 67 34
will produce:
34 57 67 34
57 67 34
67 34
34
You may need to store them in hard drive files, because for 10000 segments for each product, you will get a lot of "suffixes" (up to 10000 per product actually). The good news is that you can store them contiguously so you don't have too many hard disk seeks.
Then, you can simply linearly scan the suffix list and match the first k values for a query that contains k segment values. So, if in the above list you are searching for 57 67, it will return that product, because it matches the second suffix.
You can also do a tree indexing for faster matching, but this can get too complicated.
Edit: As I was saying in the comment, this is an adaptation of substring matching. You must also sort the suffixes lexicographically (comparing segment values position by position); then you can binary-search the sorted list of suffixes, which in theory indicates a match/mismatch in about log(10000) comparisons.