I am having some trouble importing the following file: http://www.kuleuven.be/bio/ento/temp/test.xlsx into R in the correct encoding. In particular,
library("xlsx")
read.xlsx("test.xlsx",1,header=F,colClasses=c("character"),encoding="UTF-8")
gives me
X1
1 a-cadinol
2 a-calacorene
3 a-caryophyllene alcohol
4 a-curcumene
5 a-elemol
6 a-muurolene
7 a-terpineol acetate
8 ß-4-dimethyl-3-cyclohexane-1-ethanol acetate
9 ß-bisabolene
10 ß-bisabolol
11 ß-bourbonene
12 ß-caryophyllene alcohol
13 ß-cyclocitral
14 ß-farnesol
15 ß-selinene
16 ß-sesquiphellandrene
17 <U+03B3>-cadinene
18 <U+03B3>-Carboethoxy-<U+03B3>-butyrolactone
19 <U+03B3>-ethyl-<U+03B3>-butyrolactone
20 <U+03B3>-eudesmol
21 <U+03B3>-muurolene
22 <U+03B3>-nonalactone
23 <U+03B3>-octalactone
24 <U+03B3>-selinene
25 <U+03B3>-undecalactone
26 d-cadinene
27 d-cadinol
28 d-muurolene
29 d-undecalactone
but the a-
, <U+03B3>-
and d-
should be alpha-
, gamma-
and delta-
Any thoughts on how I could import my file in the correct encoding?
I am working on Windows, and iconvlist()
gives me
[1] "437" "850" "852" "855" "857"
[6] "860" "861" "862" "863" "865"
[11] "866" "869" "ANSI_X3.4-1968" "ANSI_X3.4-1986" "ASCII"
[16] "ASMO-708" "BIG-5" "BIG-FIVE" "big5" "BIG5"
[21] "big5-hkscs" "BIG5-HKSCS" "big5hkscs" "BIG5HKSCS" "CP-GR"
[26] "CP-IS" "cp1025" "CP1125" "CP1133" "CP1200"
[31] "CP12000" "CP12001" "CP1201" "CP1250" "CP1251"
[36] "CP1252" "CP1253" "CP1254" "CP1255" "CP1256"
[41] "CP1257" "CP1258" "CP1361" "CP154" "CP367"
[46] "CP437" "CP50221" "CP51932" "CP65001" "CP737"
[51] "CP775" "CP819" "CP850" "CP852" "CP853"
[56] "CP855" "CP857" "CP858" "CP860" "CP861"
[61] "CP862" "CP863" "CP864" "CP865" "cp866"
[66] "CP866" "CP869" "CP874" "cp875" "CP932"
[71] "CP936" "CP949" "CP950" "CSASCII" "CSIBM855"
[76] "CSIBM857" "CSIBM860" "CSIBM861" "CSIBM863" "CSIBM864"
[81] "CSIBM865" "CSIBM866" "CSIBM869" "csISO2022JP" "CSISOLATIN1"
[86] "CSPC775BALTIC" "CSPC850MULTILINGUAL" "CSPC862LATINHEBREW" "CSPC8CODEPAGE437" "CSPCP852"
[91] "CSPTCP154" "CSWINDOWS31J" "CYRILLIC-ASIAN" "DOS-720" "DOS-862"
[96] "EUC-CN" "euc-jp" "euc-kr" "EUC-KR" "EUCCN"
[101] "eucjp" "euckr" "GB18030" "gb2312" "GBK"
[106] "hz-gb-2312" "IBM-CP1133" "IBM-Thai" "IBM00858" "IBM00924"
[111] "IBM01047" "IBM01140" "IBM01141" "IBM01142" "IBM01143"
[116] "IBM01144" "IBM01145" "IBM01146" "IBM01147" "IBM01148"
[121] "IBM01149" "IBM037" "IBM1026" "IBM273" "IBM277"
[126] "IBM278" "IBM280" "IBM284" "IBM285" "IBM290"
[131] "IBM297" "IBM367" "IBM420" "IBM423" "IBM424"
[136] "IBM437" "IBM437" "IBM500" "ibm737" "ibm775"
[141] "IBM775" "IBM819" "ibm850" "IBM850" "ibm852"
[146] "IBM852" "IBM855" "IBM855" "ibm857" "IBM857"
[151] "IBM860" "IBM860" "ibm861" "IBM861" "IBM862"
[156] "IBM863" "IBM863" "IBM864" "IBM864" "IBM865"
[161] "IBM865" "IBM866" "ibm869" "IBM869" "IBM870"
[166] "IBM871" "IBM880" "IBM905" "iso-2022-jp" "iso-2022-jp"
[171] "ISO-2022-JP" "ISO-2022-JP-MS" "iso-2022-kr" "ISO-8859-1" "iso-8859-13"
[176] "iso-8859-15" "iso-8859-2" "iso-8859-3" "iso-8859-4" "iso-8859-5"
[181] "iso-8859-6" "iso-8859-7" "iso-8859-8" "iso-8859-8-i" "iso-8859-9"
[186] "ISO-IR-100" "ISO-IR-6" "ISO_646.IRV:1991" "ISO_8859-1" "ISO_8859-1:1987"
[191] "ISO2022-JP" "ISO2022-JP-MS" "iso2022-kr" "ISO646-US" "iso8859-1"
[196] "ISO8859-1" "iso8859-13" "iso8859-15" "iso8859-2" "iso8859-3"
[201] "iso8859-4" "iso8859-5" "iso8859-6" "iso8859-7" "iso8859-8"
[206] "iso8859-8-i" "iso8859-9" "Johab" "JOHAB" "koi8-r"
[211] "koi8-u" "ks_c_5601-1987" "L1" "latin-9" "LATIN1"
[216] "latin2" "latin3" "latin4" "latin5" "latin7"
[221] "latin9" "mac" "mac-centraleurope" "mac-is" "macarabic"
[226] "maccentraleurope" "maccroatian" "maccyrillic" "macgreek" "machebrew"
[231] "maciceland" "macintosh" "macis" "macroman" "macromania"
[236] "macthai" "macturkish" "macukraine" "macukrainian" "MS-ANSI"
[241] "MS-ARAB" "MS-CYRL" "MS-EE" "MS-GREEK" "MS-HEBR"
[246] "MS-TURK" "MS50221" "MS51932" "MS932" "MS936"
[251] "PT154" "PTCP154" "SHIFFT_JIS" "SHIFFT_JIS-MS" "shift-jis"
[256] "shift_jis" "SJIS" "SJIS-MS" "SJIS-OPEN" "SJIS-WIN"
[261] "UCS-2" "UCS-2BE" "UCS-2LE" "UCS-4" "UCS-4BE"
[266] "UCS-4BE" "UCS-4LE" "UCS-4LE" "UCS2" "UCS2BE"
[271] "UCS2LE" "UCS4" "UCS4BE" "UCS4LE" "UHC"
[276] "unicodeFFFE" "US" "US-ASCII" "UTF-16" "UTF-16BE"
[281] "UTF-16LE" "UTF-32" "UTF-32BE" "UTF-32LE" "UTF-8"
[286] "UTF16" "UTF16BE" "UTF16LE" "UTF32" "UTF32BE"
[291] "UTF32LE" "UTF8" "WINBALTRIM" "windows-1250" "windows-1251"
[296] "windows-1252" "windows-1253" "windows-1254" "windows-1255" "windows-1256"
[301] "windows-1257" "windows-1258" "WINDOWS-31J" "WINDOWS-50221" "WINDOWS-51932"
[306] "windows-874" "WINDOWS-932" "WINDOWS-936" "x-Chinese_CNS" "x-cp20001"
[311] "x-cp20003" "x-cp20004" "x-cp20005" "x-cp20261" "x-cp20269"
[316] "x-cp20936" "x-cp20949" "x-cp50227" "x-EBCDIC-KoreanExtended" "x-Europa"
[321] "x-IA5" "x-IA5-German" "x-IA5-Norwegian" "x-IA5-Swedish" "x-iscii-as"
[326] "x-iscii-be" "x-iscii-de" "x-iscii-gu" "x-iscii-ka" "x-iscii-ma"
[331] "x-iscii-or" "x-iscii-pa" "x-iscii-ta" "x-iscii-te" "x-mac-arabic"
[336] "x-mac-ce" "x-mac-chinesesimp" "x-mac-chinesetrad" "x-mac-croatian" "x-mac-cyrillic"
[341] "x-mac-greek" "x-mac-hebrew" "x-mac-icelandic" "x-mac-japanese" "x-mac-korean"
[346] "x-mac-romanian" "x-mac-thai" "x-mac-turkish" "x-mac-ukrainian" "x_Chinese-Eten"
I tried with many of these, to no avail... Unfortunately, I also don't know what encoding Excel saved my file in...
Also, is there any easy function in R that would allow me to convert all Greek alpha, beta, gamma and delta's (in original encoding) to "alpha", "beta", "gamma" and "delta" (ie written out in full)? Or to do the reverse, ie convert "alpha", "beta", "gamma" etc written out in full to single Greek characters?
EDIT: regarding my last question I tried
togreek=function(compname) {
n=as.character(compname,encoding="UTF-8")
n=gsub("alpha","\u03B1",n)
n=gsub("beta","\u03B2",n)
n=gsub("gamma","\u03B3",n)
n=gsub("delta","\u03B4",n)
n=gsub("epsilon","\u03B5",n)
n
}
tolatin=function(compname) {
n=as.character(compname,encoding="UTF-8")
n=gsub("\u03B1","alpha",n)
n=gsub("\u03B2","beta",n)
n=gsub("\u03B3","gamma",n)
n=gsub("\u03B4","delta",n)
n=gsub("\u03B5","epsilon",n)
n
}
The function tolatin seems to work:
library("xlsx")
test=read.xlsx("test.xlsx",1,header=F,colClasses=c("character"),encoding="UTF-8")
tolatin(test$X1)
[1] "alpha-cadinol" "alpha-calacorene" "alpha-caryophyllene alcohol"
[4] "alpha-curcumene" "alpha-elemol" "alpha-muurolene"
[7] "alpha-terpineol acetate" "beta-4-dimethyl-3-cyclohexane-1-ethanol acetate" "beta-bisabolene"
[10] "beta-bisabolol" "beta-bourbonene" "beta-caryophyllene alcohol"
[13] "beta-cyclocitral" "beta-farnesol" "beta-selinene"
[16] "beta-sesquiphellandrene" "gamma-cadinene" "gamma-Carboethoxy-gamma-butyrolactone"
[19] "gamma-ethyl-gamma-butyrolactone" "gamma-eudesmol" "gamma-muurolene"
[22] "gamma-nonalactone" "gamma-octalactone" "gamma-selinene"
[25] "gamma-undecalactone" "delta-cadinene" "delta-cadinol"
[28] "delta-muurolene" "delta-undecalactone"
But if I then convert back to Greek characters I again run into problems:
togreek(tolatin(test$X1))
[1] "α-cadinol" "α-calacorene" "α-caryophyllene alcohol"
[4] "α-curcumene" "α-elemol" "α-muurolene"
[7] "α-terpineol acetate" "ß-4-dimethyl-3-cyclohexane-1-ethanol acetate" "ß-bisabolene"
[10] "ß-bisabolol" "ß-bourbonene" "ß-caryophyllene alcohol"
[13] "ß-cyclocitral" "ß-farnesol" "ß-selinene"
[16] "ß-sesquiphellandrene" "<U+03B3>-cadinene" "<U+03B3>-Carboethoxy-<U+03B3>-butyrolactone"
[19] "<U+03B3>-ethyl-<U+03B3>-butyrolactone" "<U+03B3>-eudesmol" "<U+03B3>-muurolene"
[22] "<U+03B3>-nonalactone" "<U+03B3>-octalactone" "<U+03B3>-selinene"
[25] "<U+03B3>-undecalactone" "d-cadinene" "d-cadinol"
[28] "d-muurolene" "d-undecalactone"
Any thoughts what I am doing wrong?
Try this:
library(stringi)
Apply this to see your file codification
The first one works for me.
Then apply the result in the
read.csv()
, as the example bellow:Try this:
Sys.setlocale(category = "LC_ALL", locale = "Greek")