Neo4j Cypher query structure and performance optim

2019-01-29 06:50发布

问题:

I have created a Cypher query dynamic builder. For a complex cases this builder produces a quite big queries, for example:

MATCH (parentD)-[:CONTAINS]->(childD:Decision)-[ru:CREATED_BY]->(u:User) 
WHERE id(parentD) = {decisionId} 
MATCH (childD)<-[:SET_FOR]-(filterValue415431:Value)-[:SET_ON]->(filterCharacteristic415431:Characteristic) 
WHERE id(filterCharacteristic415431) = 415431 
WITH filterValue415431, childD, ru, u 
WHERE  ({filterValue4154311} IN filterValue415431.value ) 
 OR ({filterValue4154312} IN filterValue415431.value ) 
 OR ({filterValue4154313} IN filterValue415431.value ) 
 OR ({filterValue4154314} IN filterValue415431.value ) 
 OR ({filterValue4154315} IN filterValue415431.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue415441:Value)-[:SET_ON]->(filterCharacteristic415441:Characteristic) 
WHERE id(filterCharacteristic415441) = 415441 
WITH filterValue415441, childD, ru, u 
WHERE  ({filterValue4154416} IN filterValue415441.value ) 
 OR ({filterValue4154417} IN filterValue415441.value ) 
 OR ({filterValue4154418} IN filterValue415441.value ) 
 OR ({filterValue4154419} IN filterValue415441.value ) 
 OR ({filterValue41544110} IN filterValue415441.value ) 
 OR ({filterValue41544111} IN filterValue415441.value ) 
 OR ({filterValue41544112} IN filterValue415441.value ) 
 OR ({filterValue41544113} IN filterValue415441.value ) 
 OR ({filterValue41544114} IN filterValue415441.value ) 
 OR ({filterValue41544115} IN filterValue415441.value ) 
 OR ({filterValue41544116} IN filterValue415441.value ) 
 OR ({filterValue41544117} IN filterValue415441.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue416273:Value)-[:SET_ON]->(filterCharacteristic416273:Characteristic) 
WHERE id(filterCharacteristic416273) = 416273 
WITH filterValue416273, childD, ru, u 
WHERE  (filterValue416273.value >= {filterValue41627318}) 
 AND (filterValue416273.value <= {filterValue41627319})  
MATCH (childD)<-[:SET_FOR]-(filterValue417410:Value)-[:SET_ON]->(filterCharacteristic417410:Characteristic) 
WHERE id(filterCharacteristic417410) = 417410 
WITH filterValue417410, childD, ru, u 
MATCH (childD)<-[:SET_FOR]-(filterValue416423:Value)-[:SET_ON]->(filterCharacteristic416423:Characteristic) 
WHERE id(filterCharacteristic416423) = 416423 
WITH filterValue416423, childD, ru, u 
WHERE  ({filterValue41642320} IN filterValue416423.value ) 
 OR ({filterValue41642321} IN filterValue416423.value ) 
 OR ({filterValue41642322} IN filterValue416423.value ) 
 OR ({filterValue41642323} IN filterValue416423.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue415673:Value)-[:SET_ON]->(filterCharacteristic415673:Characteristic) 
WHERE id(filterCharacteristic415673) = 415673 
WITH filterValue415673, childD, ru, u 
WHERE  ({filterValue41567324} IN filterValue415673.value ) 
 OR ({filterValue41567325} IN filterValue415673.value ) 
 OR ({filterValue41567326} IN filterValue415673.value ) 
 OR ({filterValue41567327} IN filterValue415673.value ) 
 OR ({filterValue41567328} IN filterValue415673.value ) 
 OR ({filterValue41567329} IN filterValue415673.value ) 
 OR ({filterValue41567330} IN filterValue415673.value ) 
 OR ({filterValue41567331} IN filterValue415673.value ) 
 OR ({filterValue41567332} IN filterValue415673.value ) 
 OR ({filterValue41567333} IN filterValue415673.value ) 
 OR ({filterValue41567334} IN filterValue415673.value ) 
 OR ({filterValue41567335} IN filterValue415673.value ) 
 OR ({filterValue41567336} IN filterValue415673.value ) 
 OR ({filterValue41567337} IN filterValue415673.value ) 
 OR ({filterValue41567338} IN filterValue415673.value ) 
 OR ({filterValue41567339} IN filterValue415673.value ) 
OPTIONAL MATCH (childD)<-[:VOTED_FOR]-(vg:VoteGroup)-[:VOTED_ON]->(c:Criterion) 
WHERE id(c) IN {criteriaIds} 
WITH childD, ru, u, vg.avgVotesWeight as weight, vg.totalVotes as totalVotes 
WITH ru, u, childD , toFloat(sum(weight)) as weight, toInt(sum(totalVotes)) as totalVotes  
ORDER BY  weight DESC 
SKIP 0 LIMIT 10 
RETURN ru, u, childD AS decision, weight, totalVotes, 
 [ (parentD)<-[:DEFINED_BY]-(entity)<-[:COMMENTED_ON]-(comg:CommentGroup)-[:COMMENTED_FOR]->(childD) | 
  {entityId: id(entity),  types: labels(entity), totalComments: toInt(comg.totalComments)} ] AS commentGroups, 
 [ (parentD)<-[:DEFINED_BY]-(c1:Criterion)<-[:VOTED_ON]-(vg1:VoteGroup)-[:VOTED_FOR]->(childD) | 
  {criterionId: id(c1),  weight: vg1.avgVotesWeight, totalVotes: toInt(vg1.totalVotes)} ] AS weightedCriteria, 
 [ (parentD)<-[:DEFINED_BY]-(ch1:Characteristic)<-[:SET_ON]-(v1:Value)-[:SET_FOR]->(childD) | 
  {characteristicId: id(ch1),  value: v1.value, valueType: ch1.valueType, visualMode: ch1.visualMode} ] AS valuedCharacteristics

Right now I'm not very happy with a performance. For example call on this query takes ~500ms

Could you please take a look and tell if there is a chance to improve this query ?

UPDATED

This is a pretty much the same query but with a different parameters:

MATCH (parentD)-[:CONTAINS]->(childD:Decision)-[ru:CREATED_BY]->(u:User) 
WHERE id(parentD) = 415406 
MATCH (childD)<-[:SET_FOR]-(filterValue416423:Value)-[:SET_ON]->(filterCharacteristic416423:Characteristic) 
WHERE id(filterCharacteristic416423) = 416423 
WITH filterValue416423, childD, ru, u 
WHERE ('Adobe RGB' IN filterValue416423.value ) OR ('ECI RGB' IN filterValue416423.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue416273:Value)-[:SET_ON]->(filterCharacteristic416273:Characteristic) 
WHERE id(filterCharacteristic416273) = 416273 WITH filterValue416273, childD, ru, u 
WHERE  (filterValue416273.value >= 4) AND (filterValue416273.value <= 53)  
MATCH (childD)<-[:SET_FOR]-(filterValue415431:Value)-[:SET_ON]->(filterCharacteristic415431:Characteristic) 
WHERE id(filterCharacteristic415431) = 415431 WITH filterValue415431, childD, ru, u 
WHERE  ('Compact' IN filterValue415431.value ) 
  OR ('Compact SLR' IN filterValue415431.value ) 
  OR ('Large SLR' IN filterValue415431.value ) 
  OR ('Rangefinder-style mirrorless' IN filterValue415431.value ) 
  OR ('SLR-like (bridge)' IN filterValue415431.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue415441:Value)-[:SET_ON]->(filterCharacteristic415441:Characteristic) 
WHERE id(filterCharacteristic415441) = 415441 WITH filterValue415441, childD, ru, u 
WHERE  ('Brass' IN filterValue415441.value ) 
  OR ('Carbon fiber' IN filterValue415441.value )  
  OPTIONAL MATCH (childD)<-[:VOTED_FOR]-(vg:VoteGroup)-[:VOTED_ON]->(c:Criterion) 
WHERE id(c) IN [415414, 415415, 415412, 415426, 415411]  
WITH childD, ru, u, vg.avgVotesWeight as weight, vg.totalVotes as totalVotes 
WITH ru, u, childD , toFloat(sum(weight)) as weight, toInt(sum(totalVotes)) as totalVotes  
ORDER BY  weight DESC 
SKIP 0 LIMIT 10 
RETURN ru, u, childD AS decision, weight, totalVotes, 
[ (parentD)<-[:DEFINED_BY]-(entity)<-[:COMMENTED_ON]-(comg:CommentGroup)-[:COMMENTED_FOR]->(childD) | 
  {entityId: id(entity),  types: labels(entity), totalComments: toInt(comg.totalComments)} ] AS commentGroups, 
[ (parentD)<-[:DEFINED_BY]-(c1:Criterion)<-[:VOTED_ON]-(vg1:VoteGroup)-[:VOTED_FOR]->(childD) | 
  {criterionId: id(c1),  weight: vg1.avgVotesWeight, totalVotes: toInt(vg1.totalVotes)} ] AS weightedCriteria, 
[ (parentD)<-[:DEFINED_BY]-(ch1:Characteristic)<-[:SET_ON]-(v1:Value)-[:SET_FOR]->(childD) | 
  {characteristicId: id(ch1),  value: v1.value, valueType: ch1.valueType, visualMode: ch1.visualMode} ] AS valuedCharacteristics

Cypher version: CYPHER 3.1, planner: COST, runtime: INTERPRETED. 646192 total db hits in 390 ms.

UPDATED

This is the output of :schema

Indexes
   ON :Characteristic(lowerName) ONLINE
   ON :CharacteristicGroup(lowerName) ONLINE
   ON :Criterion(lowerName) ONLINE
   ON :CriterionGroup(lowerName) ONLINE
   ON :Decision(lowerName) ONLINE
   ON :FlagType(name) ONLINE (for uniqueness constraint)
   ON :HistoryValue(originalValue) ONLINE
   ON :Permission(code) ONLINE (for uniqueness constraint)
   ON :Role(name) ONLINE (for uniqueness constraint)
   ON :User(email) ONLINE (for uniqueness constraint)
   ON :User(username) ONLINE (for uniqueness constraint)
   ON :Value(value) ONLINE

Constraints
   ON ( flagtype:FlagType ) ASSERT flagtype.name IS UNIQUE
   ON ( permission:Permission ) ASSERT permission.code IS UNIQUE
   ON ( role:Role ) ASSERT role.name IS UNIQUE
   ON ( user:User ) ASSERT user.email IS UNIQUE
   ON ( user:User ) ASSERT user.username IS UNIQUE

UPDATED

I have optimized the query as suggest at the answer below:

MATCH (parentD)-[:CONTAINS]->(childD:Decision)
WHERE id(parentD) = 415406 
MATCH (childD)<-[:SET_FOR]-(filterValue416423)-[:SET_ON]->(filterCharacteristic416423) 
WHERE id(filterCharacteristic416423) = 416423 
WITH DISTINCT filterValue416423, childD 
WHERE ('Adobe RGB' IN filterValue416423.value ) OR ('ECI RGB' IN filterValue416423.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue416273)-[:SET_ON]->(filterCharacteristic416273) 
WHERE id(filterCharacteristic416273) = 416273 
WITH DISTINCT childD, filterValue416273  
WHERE  (filterValue416273.value >= 4) AND (filterValue416273.value <= 53)  
MATCH (childD)<-[:SET_FOR]-(filterValue415431)-[:SET_ON]->(filterCharacteristic415431) 
WHERE id(filterCharacteristic415431) = 415431 
WITH DISTINCT childD, filterValue415431 
WHERE  ('Compact' IN filterValue415431.value ) 
  OR ('Compact SLR' IN filterValue415431.value ) 
  OR ('Large SLR' IN filterValue415431.value ) 
  OR ('Rangefinder-style mirrorless' IN filterValue415431.value ) 
  OR ('SLR-like (bridge)' IN filterValue415431.value )  
MATCH (childD)<-[:SET_FOR]-(filterValue415441)-[:SET_ON]->(filterCharacteristic415441) 
WHERE id(filterCharacteristic415441) = 415441 
WITH DISTINCT childD, filterValue415441 
WHERE  ('Brass' IN filterValue415441.value ) 
  OR ('Carbon fiber' IN filterValue415441.value )  
  OPTIONAL MATCH (childD)<-[:VOTED_FOR]-(vg:VoteGroup)-[:VOTED_ON]->(c:Criterion) 
WHERE id(c) IN [415414, 415415, 415412, 415426, 415411]  

WITH DISTINCT * MATCH (childD)-[ru:CREATED_BY]->(u:User)  
WITH DISTINCT childD, ru, u, vg.avgVotesWeight as weight, vg.totalVotes as totalVotes 
WITH DISTINCT ru, u, childD , toFloat(sum(weight)) as weight, toInt(sum(totalVotes)) as totalVotes  
ORDER BY  weight DESC 
SKIP 0 LIMIT 10 
RETURN ru, u, childD AS decision, weight, totalVotes, 
[ (parentD)<-[:DEFINED_BY]-(entity)<-[:COMMENTED_ON]-(comg:CommentGroup)-[:COMMENTED_FOR]->(childD) | 
  {entityId: id(entity),  types: labels(entity), totalComments: toInt(comg.totalComments)} ] AS commentGroups, 
[ (parentD)<-[:DEFINED_BY]-(c1:Criterion)<-[:VOTED_ON]-(vg1:VoteGroup)-[:VOTED_FOR]->(childD) | 
  {criterionId: id(c1),  weight: vg1.avgVotesWeight, totalVotes: toInt(vg1.totalVotes)} ] AS weightedCriteria, 
[ (parentD)<-[:DEFINED_BY]-(ch1:Characteristic)<-[:SET_ON]-(v1)-[:SET_FOR]->(childD) | 
  {characteristicId: id(ch1),  value: v1.value, valueType: ch1.valueType, visualMode: ch1.visualMode} ] AS valuedCharacteristics

PROFILE output:

With DISTINCT childD the query works pretty slow, without much better but stil so far from perfect

One more try

PROFILE MATCH (parentD)-[:CONTAINS]->(childD:Decision)
WHERE id(parentD) = 415406 
MATCH (childD)<-[:SET_FOR]-(filterValue416423)-[:SET_ON]->(filterCharacteristic416423)
USING JOIN ON childD
WHERE id(filterCharacteristic416423) = 416423
AND ('Adobe RGB' IN filterValue416423.value ) OR ('ECI RGB' IN filterValue416423.value )
WITH DISTINCT childD
MATCH (childD)<-[:SET_FOR]-(filterValue416273)-[:SET_ON]->(filterCharacteristic416273) 
USING JOIN ON childD
WHERE id(filterCharacteristic416273) = 416273 AND (filterValue416273.value >= 4) AND (filterValue416273.value <= 53)
WITH DISTINCT childD  
MATCH (childD)<-[:SET_FOR]-(filterValue415431)-[:SET_ON]->(filterCharacteristic415431) 
USING JOIN ON childD
WHERE id(filterCharacteristic415431) = 415431
AND ('Compact' IN filterValue415431.value ) 
  OR ('Compact SLR' IN filterValue415431.value ) 
  OR ('Large SLR' IN filterValue415431.value ) 
  OR ('Rangefinder-style mirrorless' IN filterValue415431.value ) 
  OR ('SLR-like (bridge)' IN filterValue415431.value )
WITH DISTINCT childD
MATCH (childD)<-[:SET_FOR]-(filterValue415441)-[:SET_ON]->(filterCharacteristic415441) 
USING JOIN ON childD
WHERE id(filterCharacteristic415441) = 415441
AND ('Brass' IN filterValue415441.value ) 
  OR ('Carbon fiber' IN filterValue415441.value )  

OPTIONAL MATCH (childD)<-[:VOTED_FOR]-(vg:VoteGroup)-[:VOTED_ON]->(c:Criterion) 
WHERE id(c) IN [415414, 415415, 415412, 415426, 415411]  

WITH DISTINCT * MATCH (childD)-[ru:CREATED_BY]->(u:User)  
WITH DISTINCT childD, ru, u, vg.avgVotesWeight as weight, vg.totalVotes as totalVotes 
WITH DISTINCT ru, u, childD , toFloat(sum(weight)) as weight, toInt(sum(totalVotes)) as totalVotes  
ORDER BY  weight DESC 
SKIP 0 LIMIT 10 
RETURN childD

回答1:

The main problem with your query, is that you are basically doing a lot of checks, with rows running wild. So here are some tips to reduce how many rows you are generating at each MATCH.

1) Unless you NEED duplicates, use WITH DISTINCT instead of just WITH. WITH can create duplicate rows (because you only cut off a column), and every duplicate row you process is wasted time and extra DB hits. (Namely, every filter column you drop adds duplicate rows)

2) :Value.value is overloaded. It has no semantic meaning, and the value isn't even guaranteed to be any kind of type. That means every :Value check has to go out and touch a bunch of :Value nodes that have nothing to do with what your searching for. So as the number of attached :Value nodes increases, the more expensive it becomes to find the right one (This is less expensive if it could be indexed, so that it could just find the right :Value, and see what it is connected to. This doesn't help if you can't change the schema you're working with, and by schema, I mean how your data/relationships are setup).

3) Only check what you need to check. It might seem more efficient to say (a:A)-[:TO]->(b:B), but if all [:TO] are from :A to :B, Neo4j now has to verify that the first node is an :A and the second node is a :B. Cypher doesn't know what is implicitly true, so it has to do the check, but each of these redundant checks has to go out and hit the DB for every row. So it is better to say (a)-[:TO]->(b).

4) Limit variable scope. Here, you match -[ru:CREATED_BY]->(u:User) at the beginning but than don't use it til the end, with no filters. This multiplies how many rows you have by the number of -[ru:CREATED_BY]->(u:User) on each decision, that ALL have to be checked in the further matches. Unless -[ru:CREATED_BY]->(u:User) somehow greatly limits the matched decisions (or there can only be one per decision), match this support information at the end.

5) Order your filters from strongest to weakest (if you can). to cut as many rows as early as possible.

6) Tricks to minimize rows. Each row pulled up makes the following steps in the query have to work that much harder, so minimize rows in queries. If you are using OR to combine unrelated, but similar columns queries (like all orgs with conditions A or orgs with conditions B) and the work of the two queries just make things more expensive for the other half, it might be better to use UNION to combine the results of smaller, faster queries (and UNION can run in parallel up to the merge results). Note that simple queries like WHERE org.id in [1,2,3] are still faster than UNION, since the work can all be done in one lookup.

Aside from union, if you are collecting nodes that you don't filter on, you can use collect(column) to reduce 'duplicates' down to 1 row, and than UNWIND (column) as column at the end of the query to get your rows back! (column here referring to variable name)

7) Doing a lot of filters on 1 node? Cypher has USING hints for that! The hint USING JOIN ON column tells Cypher that it will probably be more efficient doing this match with more starting leafs and joining them. So using USING JOIN ON childD on each match will tell Cypher to do all the filters in parallel, and use the overlapping rows of all of them. Note that USINGs are just you telling Cypher "trust me, this should go faster if we try doing this" which can actually make the query worse if you are wrong. (USING JOIN should be useful though for making large queries more parallel though)

UPDATE:
First, a note on node.id = "constant" AND node.value = "constant" OR node.id = "constant2" AND node.value = "constant2" vs node.value = map[node.id]. The first query is able to do node filtering on node lookup, while the later has to filter through all of the nodes that where already looked up. Without previous filtering on that lookup, that means the map has to pull in all nodes. While the map offers some level of (arguable) simplicity/flexibility, it is one of the least efficient ways to filter nodes.

Second, The big problem with your query now, is the :Value is super overloaded, and you aren't finding it by ID. :Value should be a relationship, or have an indexed ID field so that you don't have to touch ALL <-[:SET_FOR]- and -[:SET_ON]->. Using the Join hint I think will at least make SET_FOR higher priority, which appears to be the more efficient of the two.

Here is my attempt to rewrite the PROFILE query more efficiently. (v1)

MATCH (parentD)-[:CONTAINS]->(childD:Decision)
WHERE id(parentD) = 415406 
MATCH (childD)<-[:SET_FOR]-(filterValue416423)-[:SET_ON]->(filterCharacteristic416423)
USING JOIN ON childD
WHERE id(filterCharacteristic416423) = 416423
WHERE ('Adobe RGB' IN filterValue416423.value ) OR ('ECI RGB' IN filterValue416423.value )
WITH DISTINCT childD
MATCH (childD)<-[:SET_FOR]-(filterValue416273)-[:SET_ON]->(filterCharacteristic416273) 
USING JOIN ON childD
WHERE id(filterCharacteristic416273) = 416273 AND (filterValue416273.value >= 4) AND (filterValue416273.value <= 53)
WITH DISTINCT childD  
MATCH (childD)<-[:SET_FOR]-(filterValue415431)-[:SET_ON]->(filterCharacteristic415431) 
USING JOIN ON childD
WHERE id(filterCharacteristic415431) = 415431
WHERE ('Compact' IN filterValue415431.value ) 
  OR ('Compact SLR' IN filterValue415431.value ) 
  OR ('Large SLR' IN filterValue415431.value ) 
  OR ('Rangefinder-style mirrorless' IN filterValue415431.value ) 
  OR ('SLR-like (bridge)' IN filterValue415431.value )
WITH DISTINCT childD
MATCH (childD)<-[:SET_FOR]-(filterValue415441)-[:SET_ON]->(filterCharacteristic415441) 
USING JOIN ON childD
WHERE id(filterCharacteristic415441) = 415441
WHERE  ('Brass' IN filterValue415441.value ) 
  OR ('Carbon fiber' IN filterValue415441.value )  

OPTIONAL MATCH (childD)<-[:VOTED_FOR]-(vg:VoteGroup)-[:VOTED_ON]->(c:Criterion) 
WHERE id(c) IN [415414, 415415, 415412, 415426, 415411]  

WITH DISTINCT * MATCH (childD)-[ru:CREATED_BY]->(u:User)  
WITH DISTINCT childD, ru, u, vg.avgVotesWeight as weight, vg.totalVotes as totalVotes 
WITH DISTINCT ru, u, childD , toFloat(sum(weight)) as weight, toInt(sum(totalVotes)) as totalVotes  
ORDER BY  weight DESC 
SKIP 0 LIMIT 10 
RETURN ru, u, childD AS decision, weight, totalVotes, 
[ (parentD)<-[:DEFINED_BY]-(entity)<-[:COMMENTED_ON]-(comg:CommentGroup)-[:COMMENTED_FOR]->(childD) | 
  {entityId: id(entity),  types: labels(entity), totalComments: toInt(comg.totalComments)} ] AS commentGroups, 
[ (parentD)<-[:DEFINED_BY]-(c1:Criterion)<-[:VOTED_ON]-(vg1:VoteGroup)-[:VOTED_FOR]->(childD) | 
  {criterionId: id(c1),  weight: vg1.avgVotesWeight, totalVotes: toInt(vg1.totalVotes)} ] AS weightedCriteria, 
[ (parentD)<-[:DEFINED_BY]-(ch1:Characteristic)<-[:SET_ON]-(v1)-[:SET_FOR]->(childD) | 
  {characteristicId: id(ch1),  value: v1.value, valueType: ch1.valueType, visualMode: ch1.visualMode} ] AS valuedCharacteristics


标签: neo4j cypher