I'm searching for a library similar in functionality to the Perl Lingua::EN::NameParse module. Essentially, I'd like to parse strings like 'Mr. Bob R. Smith' into prefix, first name, last name, and name suffix components. Google hasn't been much help in finding something like this and I'd prefer not to roll my own if possible. Anyone know of a OSS Java library that can do this in a sophisticated way?
问题:
回答1:
I just can't believe someone hasn't shared a library for this - well I looked in github and there's a javascript name parser that could be easily translated to java: https://github.com/joshfraser/JavaScript-Name-Parser
I also modified the code in one of the answers to work a little better and have included a test case:
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
public class NameParser {
private String firstName = "";
private String lastName = "";
private String middleName = "";
private List<String> middleNames = new ArrayList<String>();
private List<String> titlesBefore = new ArrayList<String>();
private List<String> titlesAfter = new ArrayList<String>();
private String[] prefixes = { "dr", "mr", "ms", "atty", "prof", "miss", "mrs" };
private String[] suffixes = { "jr", "sr", "ii", "iii", "iv", "v", "vi", "esq", "2nd", "3rd", "jd", "phd",
"md", "cpa" };
public NameParser() {
}
public NameParser(String name) {
parse(name);
}
private void reset() {
firstName = lastName = middleName = "";
middleNames = new ArrayList<String>();
titlesBefore = new ArrayList<String>();
titlesAfter = new ArrayList<String>();
}
private boolean isOneOf(String checkStr, String[] titles) {
for (String title : titles) {
if (checkStr.toLowerCase().startsWith(title))
return true;
}
return false;
}
public void parse(String name) {
if (StringUtils.isBlank(name))
return;
this.reset();
String[] words = name.split(" ");
boolean isFirstName = false;
for (String word : words) {
if (StringUtils.isBlank(word))
continue;
if (word.charAt(word.length() - 1) == '.') {
if (!isFirstName && !this.isOneOf(word, prefixes)) {
firstName = word;
isFirstName = true;
} else if (isFirstName) {
middleNames.add(word);
} else {
titlesBefore.add(word);
}
} else {
if (word.endsWith(","))
word = StringUtils.chop(word);
if (isFirstName == false) {
firstName = word;
isFirstName = true;
} else {
middleNames.add(word);
}
}
}
if (middleNames.size() > 0) {
boolean stop = false;
List<String> toRemove = new ArrayList<String>();
for (int i = middleNames.size() - 1; i >= 0 && !stop; i--) {
String str = middleNames.get(i);
if (this.isOneOf(str, suffixes)) {
titlesAfter.add(str);
} else {
lastName = str;
stop = true;
}
toRemove.add(str);
}
if (StringUtils.isBlank(lastName) && titlesAfter.size() > 0) {
lastName = titlesAfter.get(titlesAfter.size() - 1);
titlesAfter.remove(titlesAfter.size() - 1);
}
for (String s : toRemove) {
middleNames.remove(s);
}
}
}
public String getFirstName() {
return firstName;
}
public String getLastName() {
return lastName;
}
public String getMiddleName() {
if (StringUtils.isBlank(this.middleName)) {
for (String name : middleNames) {
middleName += (name + " ");
}
middleName = StringUtils.chop(middleName);
}
return middleName;
}
public List<String> getTitlesBefore() {
return titlesBefore;
}
public List<String> getTitlesAfter() {
return titlesAfter;
}
}
Test case:
import junit.framework.Assert;
import org.junit.Test;
public class NameParserTest {
private class TestData {
String name;
String firstName;
String lastName;
String middleName;
public TestData(String name, String firstName, String middleName, String lastName) {
super();
this.name = name;
this.firstName = firstName;
this.lastName = lastName;
this.middleName = middleName;
}
}
@Test
public void test() {
TestData td[] = { new TestData("Henry \"Hank\" J. Fasthoff IV", "Henry", "\"Hank\" J.", "Fasthoff"),
new TestData("April A. (Caminez) Bentley", "April", "A. (Caminez)", "Bentley"),
new TestData("fff lll", "fff", "", "lll"),
new TestData("fff mmmmm lll", "fff", "mmmmm", "lll"),
new TestData("fff mmm1 mm2 lll", "fff", "mmm1 mm2", "lll"),
new TestData("Mr. Dr. Tom Jones", "Tom", "", "Jones"),
new TestData("Robert P. Bethea Jr.", "Robert", "P.", "Bethea"),
new TestData("Charles P. Adams, Jr.", "Charles", "P.", "Adams"),
new TestData("B. Herbert Boatner, Jr.", "B.", "Herbert", "Boatner"),
new TestData("Bernard H. Booth IV", "Bernard", "H.", "Booth"),
new TestData("F. Laurens \"Larry\" Brock", "F.", "Laurens \"Larry\"", "Brock"),
new TestData("Chris A. D'Amour", "Chris", "A.", "D'Amour") };
NameParser bp = new NameParser();
for (int i = 0; i < td.length; i++) {
bp.parse(td[i].name);
Assert.assertEquals(td[i].firstName, bp.getFirstName());
Assert.assertEquals(td[i].lastName, bp.getLastName());
Assert.assertEquals(td[i].middleName, bp.getMiddleName());
}
}
}
回答2:
Maybe you could try the GATE named entity extraction component? It has build in jape grammar and gazetteer lists to extract first names, last names etc. among other things. See this page.
回答3:
Apache Commons has the HumanNameParser class.
https://commons.apache.org/sandbox/commons-text/jacoco/org.apache.commons.text.names/HumanNameParser.java.html
Name nextName = parser.parse("James C. ('Jimmy') O'Dell, Jr.")
String firstName = nextName.getFirstName();
String nickname = nextName.getNickName();
回答4:
This simple code can help :
import java.util.ArrayList;
import java.util.List;
public class NamesConverter {
private List<String> titlesBefore = new ArrayList<>();
private List<String> titlesAfter = new ArrayList<>();
private String firstName = "";
private String lastName = "";
private List<String> middleNames = new ArrayList<>();
public NamesConverter(String name) {
String[] words = name.split(" ");
boolean isTitleAfter = false;
boolean isFirstName = false;
int length = words.length;
for (String word : words) {
if (word.charAt(word.length() - 1) == '.') {
if (isTitleAfter) {
titlesAfter.add(word);
} else {
titlesBefore.add(word);
}
} else {
isTitleAfter = true;
if (isFirstName == false) {
firstName = word;
isFirstName = true;
} else {
middleNames.add(word);
}
}
}
if (middleNames.size() > 0) {
lastName = middleNames.get(middleNames.size() - 1);
middleNames.remove(lastName);
}
}
public List<String> getTitlesBefore() {
return titlesBefore;
}
public List<String> getTitlesAfter() {
return titlesAfter;
}
public String getFirstName() {
return firstName;
}
public String getLastName() {
return lastName;
}
public List<String> getMiddleNames() {
return middleNames;
}
@Override
public String toString() {
String text = "Titles before :" + titlesBefore.toString() + "\n"
+ "First name :" + firstName + "\n"
+ "Middle names :" + middleNames.toString() + "\n"
+ "Last name :" + lastName + "\n"
+ "Titles after :" + titlesAfter.toString() + "\n";
return text;
}
}
For example this input :
NamesConverter ns = new NamesConverter("Mr. Dr. Tom Jones");
NamesConverter ns1 = new NamesConverter("Ing. Tom Ridley Bridley Furthly Murthly Jones CsC.");
System.out.println(ns);
System.out.println(ns1);
Has this output :
Titles before :[Mr., Dr.]
First name :Tom
Middle names :[]
Last name :Jones
Titles after :[]
Titles before :[Ing.]
First name :Tom
Middle names :[Ridley, Bridley, Furthly, Murthly]
Last name :Jones
Titles after :[CsC.]
回答5:
Personally, I would opt for regular expressions. Here's a good intro. They're fast, concise and always do what you want.
If you want to stay within the boundaries of the java sdk, use String tokenizers.
A bit more low-level is JavaCC, a java based parser generator. Here's a link to a tutorial.
An alternative to javaCC is ANTLR, which I've personally had good experiences with.