-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathFuzzy String Match FunctionV1.R
More file actions
88 lines (75 loc) · 3.37 KB
/
Fuzzy String Match FunctionV1.R
File metadata and controls
88 lines (75 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#install.packages("stringdist")
library(stringdist)
# This function calaculates the string distance between two string variables and selects the match with the
# smallest distance. It takes 7 arguments:
# dat1 - Reference dataset, select the one you want to return with appended matches
# dat2 - Comparison dataset
# string1 - dat1 variable name in string format for comparison
# string2 - dat2 variable name in string format for comparison
# meth - stringdist(method=) method. See ?stringdist for details
# osa Optimal string aligment, (restricted Damerau-Levenshtein distance).
# lv Levenshtein distance (as in R's native adist).
# dl Full Damerau-Levenshtein distance.
# hamming Hamming distance (a and b must have same nr of characters).
# lcs Longest common substring distance.
# qgram q-gram distance.
# cosine cosine distance between q-gram profiles
# jaccard Jaccard distance between q-gram profiles
# jw Jaro, or Jaro-Winker distance.
# soundex Distance based on soundex encoding (see below)
# [OPTIONAL] id1 - dat1 variable name you want to retain after matching
# [OPTIONAL] id2 - dat2 variable name you want to retain after matching
fuzzymatch<-function(dat1,dat2,string1,string2,meth,id1,id2){
#initialize Variables:
matchfile <-NULL #iterate appends
x<-nrow(dat1) #count number of rows in input, for max number of runs
#Check to see if function has ID values. Allows for empty values for ID variables, simple list match
if(missing(id1)){id1=NULL}
if(missing(id2)){id2=NULL}
#### lowercase text only
dat1[,string1]<-as.character(tolower(unlist(dat1[,string1])))#force character, if values are factors
dat2[,string2]<-as.character(tolower(unlist(dat2[,string2])))
#Loop through dat1 dataset iteratively. This is a work around to allow for large datasets to be matched
#Can run as long as dat2 dataset fits in memory. Avoids full Cartesian join.
for(i in 1:x) {
d<-merge(dat1[i,c(string1,id1), drop=FALSE],dat2[,c(string2,id2), drop=FALSE])#drop=FALSE to preserve 1var dataframe
#Calculate String Distatnce based method specified "meth"
d$dist <- stringdist(d[,string1],d[,string2], method=meth)
#dedupes A_names selects on the smallest distatnce.
d<- d[order(d[,string1], d$dist, decreasing = FALSE),]
d<- d[!duplicated(d[,string1]),]
#append demos on matched file
matchfile <- rbind(matchfile,d)
# print(paste(round(i/x*100,2),"% complete",sep=''))
}
return(matchfile)
}
# test examole:
names1<-c("Aaliyah",
"Abbey",
"Abby",
"Abi",
"Abia",
"Abigail",
"Adalyn",
"Addison")
dataset1<-data.frame(names1,stringsAsFactors =FALSE)
names2<-c("xAaliyah",
"xAbbey",
"xAbby",
"xAbi",
"xAbia",
"xAbigail",
"xAdalyn",
"xAddison",
"xxAaliyah",
"xxxAbbey",
"xxAbby",
"xxAbi",
"xxAbia",
"xxAbigail",
"xxAdalyn",
"xxAddison")
dataset2<-data.frame(names2,stringsAsFactors =FALSE)
example<-fuzzymatch(dataset1,dataset2,"names1","names2",meth="osa")
head(example)