R을 이용해 네이버 뉴스를 크롤링했습니다.
# Naive-Bayes-style guilty/innocent classifier for crawled Naver news articles.
# Trains term-occurrence statistics from two labelled CSV corpora, scores test
# documents against both classes, prints a 2x2 accuracy table and saves a
# scatter plot of the two log-scores.
library(tm)
library(ggplot2)
library(KoNLP)
useSejongDic()

# Training data directories: guilty (a) / innocent (b), plus the two test sets.
g_data.path <- "./g/a/"
ug_data.path <- "./g/b/"
test_g_data.path <- "./g/g_test_csv/"
test_ug_data.path <- "./g/g_test2_csv/"

# Characters stripped from raw article text before noun extraction.
# BUG FIX: the original pattern '[{}~!@#$%^&*()_+=?]<>' left "<" and ">"
# OUTSIDE the character class, so it only matched a class character that was
# immediately followed by the literal string "<>". They now live inside the
# class and are removed individually.
special.chars.pattern <- '[{}~!@#$%^&*()_+=?<>]'

# Read one message file, skip its header (everything up to and including the
# first blank line), extract nouns and return them newline-joined.
get.msg <- function(path) {
  con <- file(path, open = "rt")
  on.exit(close(con), add = TRUE)  # close even if readLines/extractNoun errors
  text <- readLines(con, encoding = "UTF-8")
  msg <- text[seq(which(text == "")[1] + 1, length(text), 1)]
  msg <- extractNoun(msg)
  paste(msg, collapse = "\n")
}

# Build a TermDocumentMatrix from a character vector of documents.
get.tdm <- function(doc.vec) {
  doc.corpus <- Corpus(VectorSource(doc.vec))
  control <- list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    minDocFreq = 2
  )
  # Replace invalid multibyte sequences so tm does not error on them.
  doc.corpus <- tm_map(doc.corpus, function(x) iconv(x, sub = "byte"))
  TermDocumentMatrix(doc.corpus, control)
}

# Turn a term-document matrix into a training data frame with per-term
# frequency, corpus-wide density and document-occurrence rate.
build.training.df <- function(tdm) {
  Encoding(tdm$dimnames$Terms) <- "UTF-8"
  m <- as.matrix(tdm)
  counts <- rowSums(m)
  df <- data.frame(
    term = names(counts),
    frequency = as.numeric(counts),
    stringsAsFactors = FALSE
  )
  # Vectorized replacement of the original sapply(1:nrow(...)) loop:
  # fraction of documents each term appears in.
  occurrence <- rowSums(m > 0) / ncol(m)
  transform(df,
            density = df$frequency / sum(df$frequency),
            occurrence = occurrence)
}

# Read a labelled training CSV (columns: title, body), strip special
# characters, keep only nouns, and return the training data frame.
build.training.data <- function(csv.path) {
  raw <- read.delim(csv.path, sep = ",", stringsAsFactors = FALSE,
                    header = FALSE, na.strings = "")
  names(raw) <- c("제목", "내용")
  cleaned <- gsub(special.chars.pattern, '', raw)
  names(cleaned) <- c("제목", "내용")
  nouns <- extractNoun(cleaned)
  build.training.df(get.tdm(nouns))
}

# ---- Guilty training data ----
g_data.docs <- dir(g_data.path)
g_data.df <- build.training.data("g/g_data.csv")
# head(g_data.df[with(g_data.df, order(-density)), ], 30)

# ---- Innocent training data ----
ug_data.docs <- dir(ug_data.path)
ug_data.df <- build.training.data("g/ug_data.csv")
head(ug_data.df[with(ug_data.df, order(-density)), ], 30)

# Naive-Bayes-style score of one test document against a training class.
#   path        : CSV file (euc-kr) holding the document body
#   training.df : data frame produced by build.training.data()
#   prior       : class prior probability
#   c           : pseudo-probability for terms unseen in training
classify.data <- function(path, training.df, prior = 0.2, c = 1e-6) {
  raw <- read.delim(path, sep = ",", stringsAsFactors = FALSE,
                    header = FALSE, na.strings = "", fileEncoding = "euc-kr")
  names(raw) <- c("내용")
  cleaned <- gsub(special.chars.pattern, '', raw)
  msg <- extractNoun(cleaned)
  msg.tdm <- get.tdm(msg)
  msg.freq <- rowSums(as.matrix(msg.tdm))
  msg.match <- intersect(names(msg.freq), training.df$term)
  if (length(msg.match) < 1) {
    # No known terms: prior times the unseen-term penalty for every term.
    prior * c^(length(msg.freq))
  } else {
    match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
    prior * prod(match.probs) * c^(length(msg.freq) - length(msg.match))
  }
}

# ---- Guilty / innocent verdict ----
# Returns c(guilty score, innocent score, 1 if classified guilty else 0).
g.classifier <- function(path) {
  pr.g <- classify.data(path, g_data.df, prior = 0.01)
  pr.ug <- classify.data(path, ug_data.df, prior = 0.99)
  c(pr.g, pr.ug, ifelse(pr.g > pr.ug, 1, 0))
}

test_g_data.docs <- dir(test_g_data.path)
test_g2_data.docs <- dir(test_ug_data.path)
test_g_data.class <- suppressWarnings(lapply(
  test_g_data.docs,
  function(p) g.classifier(file.path(test_g_data.path, p))
))
test_g2_data.class <- suppressWarnings(lapply(
  test_g2_data.docs,
  function(p) g.classifier(file.path(test_ug_data.path, p))
))

# Stack per-document results and tag each row with its true label.
test_g_data.matrix <- do.call(rbind, test_g_data.class)
test_g_data.final <- cbind(test_g_data.matrix, "GUILT")
test_g2_data.matrix <- do.call(rbind, test_g2_data.class)
test_g2_data.final <- cbind(test_g2_data.matrix, "INNOCENCE")

class.matrix <- rbind(test_g_data.final, test_g2_data.final)
class.df <- data.frame(class.matrix, stringsAsFactors = FALSE)
names(class.df) <- c("pr.g", "pr.ug", "Class", "Type")
class.df$pr.g <- as.numeric(class.df$pr.g)
class.df$pr.ug <- as.numeric(class.df$pr.ug)
class.df$Class <- as.logical(as.numeric(class.df$Class))
class.df$Type <- as.factor(class.df$Type)

# Fractions of FALSE (innocent verdict) and TRUE (guilty verdict).
get.results <- function(bool.vector) {
  c(
    length(bool.vector[which(bool.vector == FALSE)]) / length(bool.vector),
    length(bool.vector[which(bool.vector == TRUE)]) / length(bool.vector)
  )
}

# Save results as a 2x2 table
test_g_data.col <- get.results(subset(class.df, Type == "GUILT")$Class)
test_g2_data.col <- get.results(subset(class.df, Type == "INNOCENCE")$Class)
class.res <- rbind(test_g_data.col, test_g2_data.col)
colnames(class.res) <- c("GUILT", "INNOCENCE")
print(class.res)

# Create final plot of results.
# BUG FIX: alpha = 0.5 was inside aes(), which maps the constant through the
# alpha *scale* instead of setting 50% transparency. It is now a fixed
# aesthetic, so scale_alpha(guide = "none") is no longer needed.
class.plot <- ggplot(class.df, aes(x = log(pr.g), y = log(pr.ug))) +
  geom_point(aes(shape = Type), alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(values = c("GUILT" = 1, "INNOCENCE" = 2),
                     name = "JUDGMENT Type") +
  xlab("log[Pr(pr.g)]") +
  ylab("log[Pr(pr.ug)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())
ggsave(plot = class.plot,
       filename = file.path("./", "03_final_classification4.pdf"),
       height = 10,
       width = 10)
'대학교 > 3.데이터마이닝' 카테고리의 다른 글
맥주 추천 시스템 (0) | 2019.01.03 |
---|---|
[11장] 소셜 네트워크 분석 (0) | 2019.01.03 |
[10장] kNN: 추천시스템 Recommendation Systems (0) | 2019.01.03 |
[9장] MDS: 미국 상원의원 유사성을 시각적으로 탐색하기 Visually Exploring US Senator Similarity (0) | 2019.01.03 |
[7장] 최적화 Optimization: 암호해독 Breaking Codes (0) | 2019.01.03 |