Find the common words in the titles of the books in Excel. The output should look like this:
'book common user_id
physics physics 1
Principles of plasma physics physics,plasma 2
Fundamentals of plasma physics fundamentals,plasma,physics 3
Fundamentals of thermodynamics fundamentals 4
So here's my shot at this problem. I am aware that the code is rather messy: I've been very sloppy with variable names, error handling and so on, but it gives you an idea of how it can be done. I've created a UDF Common()
which takes 4 arguments:
- rngText: a reference to a single cell containing the text (in your case the book title) you want to compare
- compareList: a range of cells with which to compare the first argument
- minOccurences (optional): the minimum number of occurrences a word must have to be considered "common". The default value is 2
- exclusionList (optional): a range of cells containing text that should be excluded (e.g. words like "a", "of", ...)
So for example, if you have your titles in A2:A7 and your exclusion list in E2:E3, you could use the formula = Common( A2, $A$2:$A$7, , $E$2:$E$3 )
in cell B2 and copy down to B7.
Option Explicit
'Returns the words of rngText that are "common", i.e. that occur in at least
'minOccurences entries of compareList, as a lower-case, ", "-separated string.
'
'Arguments:
'  rngText        a single cell containing the text to analyse
'  compareList    a one-dimensional range (one row or one column) of texts to
'                 compare against
'  minOccurences  minimum number of compareList entries that must contain a
'                 word for it to be reported (must be >= 2, default 2)
'  exclusionList  optional one-dimensional range of texts whose words are never
'                 reported (e.g. "a", "of")
'
'Returns #VALUE! (CVErr(xlErrValue)) when an argument is not acceptable,
'otherwise a string (possibly empty). Word matching is case-insensitive.
Function Common(rngText As Range, compareList As Range, _
        Optional minOccurences As Integer = 2, Optional exclusionList As Range) As Variant

    'Check if an exclusion list is provided
    Dim exclusionListProvided As Boolean
    exclusionListProvided = Not (exclusionList Is Nothing)

    'Check the arguments
    Dim returnError As Boolean
    returnError = False
    If rngText.Rows.Count > 1 Or rngText.Columns.Count > 1 Then
        'first argument should refer to a single cell
        returnError = True
    ElseIf IsDate(rngText.Value) Or IsNumeric(rngText.Value) Or IsError(rngText.Value) Then
        'first argument should refer to a cell containing text
        returnError = True
    ElseIf minOccurences < 2 Then
        'function should check for at least 2 occurrences
        returnError = True
    ElseIf compareList.Columns.Count > 1 And compareList.Rows.Count > 1 Then
        'compareList should be one-dimensional
        returnError = True
    ElseIf exclusionListProvided Then
        If exclusionList.Columns.Count > 1 And exclusionList.Rows.Count > 1 Then
            'exclusionList should be one-dimensional
            returnError = True
        End If
    End If

    'Return an error if one of the arguments is unexpected
    If returnError Then
        Common = CVErr(xlErrValue)
        Exit Function
    End If

    'split text into an array of words
    Dim text As String
    text = rngText.Value
    Dim words As Variant
    words = fullSplit(text)

    'convert exclusionList and compareList to arrays
    Dim arrExclude As Variant
    Dim haveExclusions As Boolean
    haveExclusions = False
    If exclusionListProvided Then
        arrExclude = rangeToStringArray(exclusionList)
        haveExclusions = commonHasItems(arrExclude)
    End If
    Dim arrCompare As Variant
    arrCompare = rangeToStringArray(compareList)

    Dim strCommon As String
    strCommon = ""

    'nothing can be common when there are no words or nothing to compare with
    If Not commonHasItems(words) Or Not commonHasItems(arrCompare) Then
        Common = strCommon
        Exit Function
    End If

    'loop through words in text
    Dim i As Long
    Dim j As Long
    Dim nOccurences As Long
    Dim excluded As Boolean
    Dim currentWord As String
    For i = LBound(words) To UBound(words)
        currentWord = LCase(words(i))

        'check if word is in exclusion list (case-insensitive, like the counting below)
        excluded = False
        If haveExclusions Then
            For j = LBound(arrExclude) To UBound(arrExclude)
                If commonTextHasWord(currentWord, CStr(arrExclude(j))) Then
                    excluded = True
                    Exit For
                End If
            Next j
        End If

        'skip words that were already reported (word repeated inside the title)
        If Not excluded Then
            If InStr(1, ", " & strCommon & ", ", ", " & currentWord & ", ", vbBinaryCompare) > 0 Then
                excluded = True
            End If
        End If

        'count the number of compare-list entries that contain the word
        If Not excluded Then
            nOccurences = 0
            For j = LBound(arrCompare) To UBound(arrCompare)
                If commonTextHasWord(currentWord, CStr(arrCompare(j))) Then
                    nOccurences = nOccurences + 1
                End If
            Next j
            If nOccurences >= minOccurences Then
                If Not strCommon = "" Then
                    strCommon = strCommon & ", "
                End If
                strCommon = strCommon & currentWord
            End If
        End If
    Next i

    Common = strCommon
End Function

'True when arr is an allocated array holding at least one element.
Private Function commonHasItems(arr As Variant) As Boolean
    commonHasItems = False
    If Not IsArray(arr) Then Exit Function
    'LBound/UBound raise an error on an unallocated array; in that case the
    'assignment below is skipped and the result stays False
    On Error Resume Next
    commonHasItems = (UBound(arr) >= LBound(arr))
    On Error GoTo 0
End Function

'True when word occurs (case-insensitively) among the words of text.
Private Function commonTextHasWord(ByVal word As String, ByVal text As String) As Boolean
    commonTextHasWord = False
    Dim tokens As Variant
    tokens = fullSplit(text)
    If Not commonHasItems(tokens) Then Exit Function
    Dim k As Long
    For k = LBound(tokens) To UBound(tokens)
        If LCase(tokens(k)) = LCase(word) Then
            commonTextHasWord = True
            Exit Function
        End If
    Next k
End Function
'Split text into words, treating each character in a fixed delimiter list
'(space , . ; ? !) as a separator.
'  text     the text to split; it is not modified
'  returns  whatever SplitText returns for the normalised text
Function fullSplit(text As Variant)
    'define list of delimiters
    Dim delimiters()
    delimiters = Array(" ", ",", ".", ";", "?", "!")

    'unique delimiter is the first one from the list
    '(LBound instead of 0 so this also works under Option Base 1)
    Dim uniqueDelimiter As String
    uniqueDelimiter = delimiters(LBound(delimiters))

    'work on a local copy: text is passed ByRef and must not be altered for the caller
    Dim normalized As String
    normalized = CStr(text)

    'replace all delimiters in the text by the unique delimiter.
    'Replace is a function: its result has to be assigned, otherwise the
    'text stays unchanged and only the first delimiter would ever split.
    Dim i As Long
    For i = LBound(delimiters) + 1 To UBound(delimiters)
        normalized = Replace(normalized, delimiters(i), uniqueDelimiter)
    Next i

    'split the text by using the unique delimiter
    fullSplit = SplitText(normalized, uniqueDelimiter)
End Function
'Split text by using a single delimiter and drop the empty substrings.
'  text       the text to split
'  delimiter  the separator passed to Split
'  returns    a zero-based String array of the non-empty substrings, in their
'             original order; an empty array (UBound = -1) when there are none
Function SplitText(text As Variant, delimiter As String)
    'split the text in substrings on each occurrence of the delimiter
    Dim tempArray() As String
    tempArray = Split(text, delimiter)

    'remove empty substrings by compacting the non-empty ones to the front
    Dim LastNonEmpty As Long
    LastNonEmpty = -1
    Dim i As Long
    For i = LBound(tempArray) To UBound(tempArray)
        If tempArray(i) <> "" Then
            LastNonEmpty = LastNonEmpty + 1
            tempArray(LastNonEmpty) = tempArray(i)
        End If
    Next i

    If LastNonEmpty < 0 Then
        'ReDim (0 To -1) is not allowed; Split on an empty string yields an empty array
        SplitText = Split(vbNullString, delimiter)
    Else
        ReDim Preserve tempArray(0 To LastNonEmpty)
        SplitText = tempArray
    End If
End Function
'Reports whether any element of array1 is equal to any element of array2.
'  returns  True as soon as one matching pair is found, False otherwise
Function sharedElements(array1() As Variant, array2() As Variant) As Boolean
    Dim outerIdx As Integer
    Dim innerIdx As Integer

    sharedElements = False
    outerIdx = LBound(array1)
    Do While outerIdx <= UBound(array1)
        innerIdx = LBound(array2)
        Do While innerIdx <= UBound(array2)
            If array1(outerIdx) = array2(innerIdx) Then
                'first match settles the answer; no need to look further
                sharedElements = True
                Exit Function
            End If
            innerIdx = innerIdx + 1
        Loop
        outerIdx = outerIdx + 1
    Loop
End Function
'Converts a range to a zero-based Variant array of its cell values, omitting
'every cell whose value is a date, is numeric or is an error.
'  myRange  the range whose cells are inspected
'  returns  the kept values in iteration order; an empty array (no elements)
'           when no cell qualifies
Function rangeToStringArray(myRange As Range)
    Dim myArray()
    Dim arraySize As Long   'Long: a range can hold more cells than an Integer can count
    arraySize = 0

    Dim c As Range
    For Each c In myRange.Cells
        If IsDate(c.Value) = False And IsNumeric(c.Value) = False And IsError(c.Value) = False Then
            ReDim Preserve myArray(0 To arraySize)
            myArray(arraySize) = c.Value
            arraySize = arraySize + 1
        End If
    Next c

    If arraySize = 0 Then
        'return an allocated, empty array so callers can safely use LBound/UBound
        rangeToStringArray = Array()
    Else
        rangeToStringArray = myArray
    End If
End Function