R access multiple source data

目前測試過的來源如下

  • txt (純文字檔,含以 Tab 分隔檔案)
  • csv (以逗號分隔檔案)
  • mdb, accdb (access)
  • SQL Server (微軟伺服器為主資料庫)
  • xlsx (Excel 2007 以上版本)
  • json (網頁資料交換格式)
  • xml (網頁資料交換格式)
  • MySQL (Oracle 伺服器資料庫)
  • SPSS (統計軟體之一)
  • SAS (統計軟體之一)
  • HTML (網頁內容)

測試環境


因 R 核心會隨著時間不斷更新,常有 package 無法在現行核心下運作的狀況,因此需要注意 R 的運作核心為何。

  • Windows 7 64-bit
  • R-3.0.1 32-bit

TEXT File : TXT(Tab) OR CSV(Comma) file


# READ & WRITE from or to a TXT(Tab) OR CSV(Comma) file
getTxtData <- read.table("xxx.txt",sep="\t",header = TRUE)
getCSVData <- read.table("xxx.csv",sep=",",header = TRUE)
calData <- cbind(as.numeric(getTxtData[,3]),as.numeric(getTxtData[,7]))
savData <- cbind(
  c("xxx","xxx"),
  c(mean(calData[1:181,1]),mean(calData[182:365,1])),
  c(mean(calData[1:181,2]),mean(calData[182:365,2]))
)
colnames(savData) <- c("date","ave_atom","ave_temp")
write.table(
  savData, 
  file="calTxt.txt", 
  quote=FALSE,
  sep="\t",
  col.names=TRUE,
  row.names=FALSE
)
write.table(
  savData, 
  file="calTxt.csv", 
  quote=FALSE,
  sep=",",
  col.names=TRUE,
  row.names=FALSE
)

ACCESS : .mdb, .accdb format BY ODBC as Database


  • First is to set ODBC on "control center"; Win 64 bit: C:\Windows\SysWOW64\odbcad32.exe
# READ & WRITE ACCESS: .mdb, *.accdb format BY ODBC as Database
# fetched data is data.frame and data for inserting into the database must be data.frame
# even id is auto increment, it is necessary to prepare
install.packages("RODBC")
library(RODBC)
conn = odbcConnect("R2ACCESS")
#allTables <- sqlTables(conn)$TABLE_NAME

# analyze data
df = sqlFetch(conn, "cityData")
barplot(
  as.numeric(df[c(1,3,2),4]),
  names.arg=df[c(1,3,2),2],
  col=c("red","green","blue"),
  main="CITY POPULATION",
  xlab="CITY",
  ylab="POPULATION (10 thousand)", 
  legend.text = df[c(1,3,2),3],
)

# insert (or delete) a new data into access db
savData <- matrix(c("4","Kyoto","Japan","1335"),nrow=1)
colnames(savData) <- c("id","name","country","population")
savData <- as.data.frame(savData)
sqlSave(conn, savData, tablename="cityData", rownames=FALSE, append=TRUE, addPK=TRUE)

# update a record
upData <- matrix(c("4","Tokyo"),nrow=1)
colnames(upData) <- c("id","name")
upData <- as.data.frame(upData)
# index means the column for being updated
sqlUpdate(conn, upData, tablename="cityData", index="id")   

close(conn) # close the connection to the file

SQL Server : RODBC 或是 ConnectionString


  • 若是採用 RODBC,First is to set ODBC on "control center"; Win 64 bit: C:\Windows\SysWOW64\odbcad32.exe
# READ SQL SERVER by RODBC
# must establish basic operation in SQL SERVER EXPRESS
install.packages("RODBC")
library(RODBC)

# method.1
# IP-address\SQLEXPRESS,Port is the format used to link the SQL Express
sqlHost <- "IP Address\\SQLEXPRESS,1433"   
sqlDatabase <- "2015_city_online"

# connection string.1 : using different web domains might go error
#dsnString <- "driver={SQL Server};server=%s;database=%s;trusted_connection=true;uid=test;pwd=test1234"

# connection string.2 : use 
# "Initial Catalog=cityData;Persist Security Info=True;" 
# to replace "trusted_connection=true;"
dsnString <- "driver={SQL Server};server=%s;database=%s;Initial Catalog=%s;Persist Security Info=True;uid=uid;pwd=pwd"

dsn <- sprintf(dsnString, sqlHost, sqlDatabase)
dbHandle <- odbcDriverConnect(dsn)

# method.2 : must operate firstly in Control Panel
dbHandle <- odbcConnect("R2SQLEXPR_2",uid="test",pwd="test1234")

# fetched data as data.frame
query <- "SELECT * FROM [Table]"
qyDF <- sqlQuery(dbHandle, query)

# insert a new data into the table
insertQuery <- "insert into [Table](name,country) values('Tokyo','Japan')"
insertDF <- sqlQuery(dbHandle, insertQuery)

# update a record
updateQuery <- "update [Table] set name='kyoto' where Id='4'"
updateDF <- sqlQuery(dbHandle, updateQuery)

# delete a record
deleteQuery <- "delete from [Table] where Id='19'"
deleteDF <- sqlQuery(dbHandle, deleteQuery)

odbcClose(dbHandle)

EXCEL : .xlsx, .xls format by XLS package


# READ EXCEL: .xlsx, .xls format by XLS package
# sheetIndex: start from 1
# data type: data.frame
# default: row 1 (index 0) as column name
install.packages("xlsx")
library(xlsx)
conXls <- read.xlsx("xxx.xls", sheetIndex = 1)
conXlsx <- read.xlsx("xxx.xlsx", sheetIndex = 1)
# write xlsx table (must be data.frame type)
savData <- cbind(conXls[,1],conXls[,2],conXls[,3])
colnames(savData) <- colnames(conXls)[1:3]
getWriteStatus <- write.xlsx(
                     savData, 
                     "ooo.xlsx", 
                     sheetName="weather", 
                     col.names=TRUE, 
                     row.names=FALSE, 
                     append=FALSE, 
                     showNA=TRUE
                   )

JSON : rjson package


# READ JSON by rjson package
# fetched data is data.frame type
install.packages(c("jsonlite","curl"))
library(jsonlite)
library(curl)

# input JSON file as data.frame
loginJSONExample <- 'https://api.github.com/users/hadley/orgs'
transFromJson <- fromJSON(loginJSONExample,flatten = TRUE)

# output JSON text
savData <- toJSON(transFromJson[,1:3])
sink("login.json",append=TRUE)
cat(savData)
sink()

MySQL : DBI


# READ & WRITE MySQL Database by DBI
# remember must open port in Linux
# set all host are accepted by following commands
# mysql> GRANT ALL PRIVILEGES ON *.* TO 'root'@'%'WITH GRANT OPTION;   
# % means all hosts are accepted
# mysql> FLUSH PRIVILEGES;   # refresh memory
install.packages("DBI")
install.packages("RMySQL")
library(DBI)
library(RMySQL)
con <-  dbConnect( 
  RMySQL::MySQL(), 
  username = "user", 
  password = "password", 
  host = "IP address", 
  port = 3306, 
  dbname = "test" 
)

# fetch data as data.frame type
getMySQLData <- dbGetQuery(con, "SELECT * FROM cityData")

# insert data as data.frame type
savData <- matrix(c('shanghai','China'),nrow=1)
colnames(savData) <- c('name','country')
savData <- as.data.frame(savData)
getInsertMySQLStatus <- dbWriteTable(con, name="cityData", value=savData, field.types=list(name="varchar(75)",country="varchar(75)"), row.names=FALSE, append=TRUE)

# update data by doGetQuery function
updateQuery <- "update cityData set name='SHANGHAI' where id='4'"
getUpdateStatus <- dbGetQuery(con, updateQuery)

# delete data by doGetQuery function
deleteQuery <- "delete from cityData where id='4'"
getDeleteQuery <- dbGetQuery(con, deleteQuery)

dbDisconnect(con)

SPSS : by foreign package


# READ SPSS File By foreign package
install.packages("foreign")
library("foreign")
getSPSSData <- read.spss("p004.sav", to.data.frame=TRUE)

SAS : by sas7bdat package


# READ SAS File by sas7bdat package
install.packages("sas7bdat")
library("sas7bdat")
getSASData <- read.sas7bdat("help.sas7bdat", debug=FALSE)

HTML : by RCurl, XML Packages


# READ HTML
install.packages("RCurl")
install.packages("XML")
library("RCurl")
library("XML")

# method.1 get html content
doc <- "<html><body>This is some random text.<p>This is some text in a paragraph.</p><p>This is a statement which says that 2 < 3 = TRUE, 4 < 5 = TRUE and 10 > 9 = TRUE.</p></body></html>"
pattern <- "</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>"
plain.text <- gsub(pattern, "\\1", doc)

# method.2
doc.html = htmlTreeParse('http://apiolaza.net/babel.html', useInternal = TRUE)
doc.text = unlist(xpathApply(doc.html, '//p', xmlValue))
doc.text = gsub('\\n', ' ', doc.text)

# method.3 analyze content by regular expression
html <- getURL("http://edition.cnn.com/", followlocation = TRUE)
doc <- htmlParse(html, asText=TRUE)
plain.text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue)

XML : by RCurl, XML Packages

# ----------
# READ XML as list (after xmlChildren() function)
# use index to access different nodes
install.packages("RCurl")
install.packages("XML")
library("RCurl")
library("XML")
getXMLData = xmlParse("http://IP-address:Port/example.xml")
root = xmlRoot(getXMLData)
child = xmlChildren(root)

# parsing method.1
getNode <- child$url   # url must be unique

# parsing method.2 by the index
allNodes <- child[1:length(child)]
allContents <- c()
for(i in seq(1,length(child),1)) {
  # allContents are also the list-type
  allContents <- c(allContents,child[i][[1]][[1]][[1]])
}

# write out xml stream
savXMLData = newXMLNode("city")

# node.1
detail <- newXMLNode("detail", parent = savXMLData)
newXMLNode("name", attrs = c(loc = 'asia'), 'Beijing', parent = detail)
newXMLNode("country", "China", parent = detail)

# node.2
detail <- newXMLNode("detail", parent = savXMLData)
newXMLNode("name", attrs = c(loc = 'america'), 'New York',parent = detail)
newXMLNode("country", "USA", parent = detail)

savXMLData   # end the XML

saveXML(
  savXMLData, 
  file="output.xml", 
  compression=0, 
  indent=TRUE, 
  prefix = '<?xml version="1.0"?>\n', doctype = NULL
)
# ----------

results matching ""

    No results matching ""