I'm trying to aggregate approximately two million rows based on user.
One user has several Transactions, each Transaction has a Platform and a TransactionType.I aggregate Platform and TransactionType columns as json and save as a single row.
But my code is slow.
How can I improve the performance?
public static void AggregateTransactions()
{
using (var db = new ApplicationDbContext())
{
db.ChangeTracker.AutoDetectChangesEnabled = false;
//Get a list of users who have transactions
var users = db.Transactions
.Select(x => x.User)
.Distinct();
foreach (var user in users.ToList())
{
//Get all transactions for a particular user
var _transactions = db.Transactions
.Include(x => x.Platform)
.Include(x => x.TransactionType)
.Where(x => x.User == user)
.ToList();
//Aggregate Platforms from all transactions for user
Dictionary<string, int> platforms = new Dictionary<string, int>();
foreach (var item in _transactions.Select(x => x.Platform).GroupBy(x => x.Name).ToList())
{
platforms.Add(item.Key, item.Count());
};
//Aggregate TransactionTypes from all transactions for user
Dictionary<string, int> transactionTypes = new Dictionary<string, int>();
foreach (var item in _transactions.Select(x => x.TransactionType).GroupBy(x => x.Name).ToList())
{
transactionTypes.Add(item.Key, item.Count());
};
db.Add<TransactionByDay>(new TransactionByDay
{
User = user,
Platforms = platforms, //The dictionary list is represented as json in table
TransactionTypes = transactionTypes //The dictionary list is represented as json in table
});
db.SaveChanges();
}
}
}
Update
So a basic view of the data would look like the following:
Tansactions Data:
Id: b11c6b67-6c74-4bbe-f712-08d609af20cf,
UserId: 1,
PlatformId: 3,
TransactionypeId: 1
Id: 4782803f-2f6b-4d99-f717-08d609af20cf,
UserId: 1,
PlatformId: 3,
TransactionypeId: 4
Aggregate data as TransactionPerDay:
Id: 9df41ef2-2fc8-441b-4a2f-08d609e21559,
UserId: 1,
Platforms: {"p3":2},
TransactionsTypes: {"t1":1,"t4":1}
So in this case, two transactions are aggregated into one. You can see that the platforms and transaction types will be aggregated as json.
You probably should not be calling db.saveChanges() within the loop. Putting it outside the loop to persist the changes once, may help.
But having said this, when dealing with large volumes of data and performance is key, I've found that ADO.NET is probably a better choice. This does not mean you have to stop using Entity Framework, but perhaps for this method you could use ADO.NET. If you go down this path you could either:
Create a stored procedure to return the data you need to work on, populate a datatable, manipulate the data and the persist everything in bulk using sqlBulkCopy.
Use a stored procedure to completely perform this operation. This avoids the need to shuttle the data to your application and the entire processing can happen within the database itself.
Linq To EF is not built for speed (LinqToSQL is easier and faster IMHO, or you could run direct SQL commands with Linq EF\SQL). Anyway, I don't know how this would speed wise:
using (var db = new MyContext(connectionstring))
{
var tbd = (from t in db.Transactions
group t by t.User
into g
let platforms = g.GroupBy(tt => tt.Platform.Name)
let trantypes = g.GroupBy(tt => tt.TransactionType.Name)
select new {
User = g.Key,
Platforms = platforms,
TransactionTypes = trantypes
}).ToList()
.Select(u => new TransactionByDay {
User=u.User,
Platforms=u.Platforms.ToDictionary(tt => tt.Key, tt => tt.Count()),
TransactionTypes = u.TransactionTypes.ToDictionary(tt => tt.Key, tt => tt.Count())
});
//...
}
The idea is to try to do less queries and includes by getting as much data as needed first. So there is no need to include with every transaction the Platform
and TransactionType
, where you can just query them once in a Dictionary
and look the data up. Further more we could do our processing in Parallel, then save all the data at once.
public static void AggregateTransactions()
{
using (var db = new ApplicationDbContext())
{
db.ChangeTracker.AutoDetectChangesEnabled = false;
//Get a list of users who have transactions
var transactionsByUser = db.Transactions
.GroupBy(x => x.User) //Not sure if EF Core supports this kind of grouping
.ToList();
var platforms = db.Platforms.ToDictionary(ks => ks.PlatformId);
var Transactiontypes = db.TransactionTypes.ToDictionary(ks => ks.TransactionTypeId);
var bag = new ConccurentBag<TransactionByDay>();
Parallel.ForEach(transactionsByUser, transaction =>
{
//Aggregate Platforms from all transactions for user
Dictionary<string, int> platforms = new Dictionary<string, int>(); //This can be converted to a ConccurentDictionary
//This can be converted to Parallel.ForEach
foreach (var item in _transactions.Select(x => platforms[x.PlatformId]).GroupBy(x => x.Name).ToList())
{
platforms.Add(item.Key, item.Count());
};
//Aggregate TransactionTypes from all transactions for user
Dictionary<string, int> transactionTypes = new Dictionary<string, int>(); //This can be converted to a ConccurentDictionary
//This can be converted to Parallel.ForEach
foreach (var item in _transactions.Select(x => Transactiontypes[c.TransactionTypeId]).GroupBy(x => x.Name).ToList())
{
transactionTypes.Add(item.Key, item.Count());
};
bag.Add(new TransactionByDay
{
User = transaction.Key,
Platforms = platforms, //The dictionary list is represented as json in table
TransactionTypes = transactionTypes //The dictionary list is represented as json in table
});
});
//Before calling this we may need to check the status of the Parallel ForEach, or just convert it back to regular foreach loop if you see no benefit.
db.AddRange(bag);
db.SaveChanges();
}
}
Variation #2
public static void AggregateTransactions()
{
using (var db = new ApplicationDbContext())
{
db.ChangeTracker.AutoDetectChangesEnabled = false;
//Get a list of users who have transactions
var users = db.Transactions
.Select(x => x.User)
.Distinct().ToList();
var platforms = db.Platforms.ToDictionary(ks => ks.PlatformId);
var Transactiontypes = db.TransactionTypes.ToDictionary(ks => ks.TransactionTypeId);
var bag = new ConccurentBag<TransactionByDay>();
Parallel.ForEach(users, user =>
{
var _transactions = db.Transactions
.Where(x => x.User == user)
.ToList();
//Aggregate Platforms from all transactions for user
Dictionary<string, int> userPlatforms = new Dictionary<string, int>();
Dictionary<string, int> userTransactions = new Dictionary<string, int>();
foreach(var transaction in _transactions)
{
if(platforms.TryGetValue(transaction.PlatformId, out var platform))
{
if(userPlatforms.TryGetValue(platform.Name, out var tmp))
{
userPlatforms[platform.Name] = tmp + 1;
}
else
{
userPlatforms.Add(platform.Name, 1);
}
}
if(Transactiontypes.TryGetValue(transaction.TransactionTypeId, out var type))
{
if(userTransactions.TryGetValue(type.Name, out var tmp))
{
userTransactions[type.Name] = tmp + 1;
}
else
{
userTransactions.Add(type.Name, 1);
}
}
}
bag.Add(new TransactionByDay
{
User = user,
Platforms = userPlatforms, //The dictionary list is represented as json in table
TransactionTypes = userTransactions //The dictionary list is represented as json in table
});
});
db.AddRange(bag);
db.SaveChanges();
}
}