YD's blog

Posted 三 10 2月 2016

DataCleaning

前陣子幫台大圖書館館員寫了一個格式轉換的腳本，應該可以讓他們花更少時間在轉檔的冗事上。

# Publishers whose rows are removed from the cleaned output. fileClean()
# matches these strings exactly against the `Publisher` column.
badPub <- c(
  "Cram101 Incorporated", "Academic Internet Publishers Incorporated",
  "Cram101 Textbook Outlines", "BiblioBazaar", "BiblioLife",
  "Blatter Press", "General Books LLC", "Kessinger Publishing",
  "Nabu Press", "The Lawbook Exchange, Ltd."
)
# Direction codes "A", "B", "C".
# NOTE(review): presumably the allowed values for fileClean()'s `direction`
# argument; fileClean() itself only reads its own parameter, which shadows
# this global -- confirm intended use.
direction <- LETTERS[1:3]
# Clean every CSV book list found in `path` and stack the results into one
# data frame laid out for the library's import format.
#
# For each file the function:
#   * repairs a shifted header (more than 16 columns read),
#   * drops rows whose Publisher is listed in the global `badPub`,
#   * splits the first space-delimited token of Price into a currency code
#     (letters only) and an amount (digits and "." only),
#   * truncates Publication.Date at the first punctuation character,
#   * stores the file name (text before the first ".") as the keyword,
#   * adds the constant bookkeeping columns and renames/reorders columns.
#
# Arguments:
#   path        Directory containing the CSV files. Every regular file in it
#               is read; sub-directories are ignored.
#   direction   Value written to the "老師自訂方向" column.
#   year        Value written to the "年度" column.
#   department  Value written to the "學門" column.
#   host        Value written to the "計畫主持" column.
#
# Returns:
#   A data frame with the 18 output columns and fresh 1..n row names, or an
#   empty data.frame() when no file contributes any row.
#
# Errors:
#   Stops when `path` is not a directory or when a file lacks one of the
#   required input columns.
#
# NOTE(review): read.csv() converts all-digit columns to numeric, so ISBN
# values with leading zeros would lose them -- confirm whether the ISBN
# columns should be read as character instead.
fileClean <- function(path, direction = "", year = "", department = "", host = "") {
  if (!dir.exists(path)) {
    stop("Directory not found: ", path, call. = FALSE)
  }

  # First piece of each element after splitting on the regexp `split`.
  # NA and empty strings both yield NA; zero-length input yields character(0).
  firstToken <- function(x, split) {
    pieces <- strsplit(as.character(x), split = split)
    vapply(pieces, function(p) p[1], character(1))
  }

  requiredCols <- c("ISBN.13", "ISBN.10", "Title", "Contributor",
                    "Publisher", "Publication.Date", "Price")
  # old name -> new name
  renameMap <- c("ISBN.13" = "ISBN",
                 "Title" = "題名",
                 "Contributor" = "著者",
                 "Publisher" = "出版者",
                 "Publication.Date" = "出版日期",
                 "Note" = "關鍵字",
                 "PriceType" = "幣別",
                 "Price" = "價格")
  outputCols <- c("年度", "學門", "主題", "ISBN", "題名", "版次", "著者",
                  "出版者", "出版日期", "藏書館", "計畫主持", "備註",
                  "關鍵字", "幣別", "價格", "來源", "老師自訂方向", "ISBN.10")

  # List the directory once and skip sub-directories, which read.csv() cannot
  # open.
  fileNames <- list.files(path)
  fileNames <- fileNames[!dir.exists(file.path(path, fileNames))]

  cleaned <- lapply(fileNames, function(fileName) {
    tempFile <- read.csv(file.path(path, fileName), header = TRUE, sep = ",",
                         row.names = NULL, stringsAsFactors = FALSE)

    # In case of the wrong header: when the data rows carry one more field
    # than the header, read.csv() prepends a "row.names" column and every name
    # ends up one position too far right; shift the names back.
    if (length(colnames(tempFile)) > 16) {
      colnames(tempFile) <- colnames(tempFile)[2:17]
    }

    missingCols <- setdiff(requiredCols, names(tempFile))
    if (length(missingCols) > 0) {
      stop("File '", fileName, "' is missing required column(s): ",
           paste(missingCols, collapse = ", "), call. = FALSE)
    }

    # Remove rows from unwanted publishers (exact match against badPub).
    tempFile <- tempFile[!(tempFile[["Publisher"]] %in% badPub), , drop = FALSE]
    if (nrow(tempFile) == 0L) {
      # Nothing left to contribute; also avoids assigning scalars to a
      # zero-row data frame, which is an error.
      return(NULL)
    }

    # Keyword = file name up to the first "."; the split pattern is a regexp,
    # hence "[.]" rather than ".".
    tempFile[["Note"]] <- strsplit(fileName, "[.]")[[1]][1]

    # Only the first space-delimited token of Price is used: its letters give
    # the currency code, its digits and "." give the amount.
    priceToken <- firstToken(tempFile[["Price"]], " ")
    priceType <- gsub("[^[:alpha:]]", "", priceToken)
    priceType[priceType %in% "Contact"] <- ""  # "Contact ..." is not a currency
    tempFile[["PriceType"]] <- priceType
    tempFile[["Price"]] <- gsub("[^[:digit:].]", "", priceToken)

    # Keep only the part of the date before the first punctuation character.
    tempFile[["Publication.Date"]] <-
      firstToken(tempFile[["Publication.Date"]], "[[:punct:]]")

    # Constant bookkeeping columns.
    tempFile[["老師自訂方向"]] <- direction
    tempFile[["年度"]] <- year
    tempFile[["主題"]] <- "topic"
    tempFile[["學門"]] <- department
    tempFile[["藏書館"]] <- "xxxx"
    tempFile[["計畫主持"]] <- host
    tempFile[["版次"]] <- ""
    tempFile[["備註"]] <- ""
    tempFile[["來源"]] <- ""

    # Rename to the output headers, then keep the output columns in order.
    positions <- match(names(renameMap), names(tempFile))
    names(tempFile)[positions] <- unname(renameMap)
    tempFile[outputCols]
  })

  cleaned <- Filter(Negate(is.null), cleaned)
  if (length(cleaned) == 0L) {
    return(data.frame())
  }

  # Bind once instead of growing the result inside the loop.
  storage <- do.call(rbind, cleaned)
  rownames(storage) <- NULL
  storage
}
Category: R
Tags: R