본문 바로가기
대학교/3.데이터마이닝

[논문] 신문기사를 바탕으로 범죄의 유·무죄 예측

by Jcoder 2019. 1. 3.



R을 이용해 네이버 뉴스를 크롤링했습니다.

g.zip




library(tm)
library(ggplot2)
library(KoNLP)
useSejongDic()
# Input locations, all relative to the working directory.
# Directory listed with dir() in the guilty-data section; the guilty training
# text itself is read from "g/g_data.csv".
g_data.path <- "./g/a/"
# Directory listed with dir() in the innocent-data section; the innocent
# training text itself is read from "g/ug_data.csv".
ug_data.path <- "./g/b/"
# Directory whose files are classified and labelled "GUILT" in the evaluation.
test_g_data.path <- "./g/g_test_csv/"
# Directory whose files are classified and labelled "INNOCENCE" in the evaluation.
test_ug_data.path <- "./g/g_test2_csv/"
# Read one text file and return the nouns of its body as a single string.
#
# Lines up to and including the first blank line are skipped; the remaining
# lines are the body. A file without any blank line is treated as all body.
#
# Args:
#   path: path of a UTF-8 text file.
# Returns:
#   A length-one character vector with the extracted nouns joined by "\n"
#   ("" when the body is empty).
get.msg <- function(path) {
  con <- file(path, open = "rt")
  # Close the connection even when reading or noun extraction fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con, encoding = "UTF-8")

  first.blank <- which(text == "")[1]
  if (is.na(first.blank)) {
    body <- text
  } else {
    # seq_len() yields an empty index when the blank line is the last line,
    # where seq(from, to, 1) would stop with a "wrong sign" error.
    body <- text[first.blank + seq_len(length(text) - first.blank)]
  }
  # Blank lines carry no nouns, so drop them before calling the analyser.
  body <- body[nzchar(body)]
  if (length(body) < 1) {
    return("")
  }

  # Extract line by line and flatten, so a list-shaped result is never
  # deparsed by paste().
  nouns <- unlist(lapply(body, extractNoun), use.names = FALSE)
  paste(nouns, collapse = "\n")
}
# Build a term-document matrix from a vector of document texts.
#
# Args:
#   doc.vec: document texts, one element per document (wrapped in a
#            VectorSource).
# Returns:
#   The TermDocumentMatrix of the corpus, built with stop-word removal,
#   punctuation removal and number removal switched on.
get.tdm <- function(doc.vec) {
  doc.corpus <- Corpus(VectorSource(doc.vec))
  # NOTE(review): `minDocFreq` does not appear among the control options
  # documented for current tm releases (they document `bounds`); it is kept
  # unchanged here -- confirm against the installed tm version.
  control <- list(stopwords = TRUE, removePunctuation = TRUE,
                  removeNumbers = TRUE, minDocFreq = 2)
  # Replace bytes that cannot be converted by their hex code. The function is
  # wrapped in content_transformer() so tm_map() applies it to the document
  # content and keeps the corpus structure intact.
  # (A variant converting with to='euc-kr' was previously left disabled here.)
  doc.corpus <- tm_map(doc.corpus,
                       content_transformer(function(x) iconv(x, sub = "byte")))
  TermDocumentMatrix(doc.corpus, control)
}
# Guilty training data ----
# Builds g_data.df: one row per term with its total frequency, its density
# (share of all term occurrences) and its occurrence (share of articles that
# contain the term).
g_data.docs <- dir(g_data.path)
all.g_data <- read.delim("g/g_data.csv", sep = ",", stringsAsFactors = FALSE,
                         header = FALSE, na.strings = "")
names(all.g_data) <- c("제목", "내용")

# Remove special characters column by column. Calling gsub() on the whole data
# frame would deparse each column into one long string, and the characters
# "<" and ">" have to sit inside the bracket expression to be stripped.
all.g_data2 <- all.g_data
all.g_data2[] <- lapply(all.g_data, function(column) {
  gsub("[{}~!@#$%^&*()_+=?<>]", "", column)
})

# One text per article: the cells of a row joined by a space, with missing
# cells treated as empty. Articles with no text left are dropped.
g_data.text <- do.call(paste, unname(lapply(all.g_data2, function(column) {
  column[is.na(column)] <- ""
  column
})))
g_data.text <- trimws(g_data.text)
g_data.text <- g_data.text[nzchar(g_data.text)]

# Keep nouns only: a list with one noun vector per article, then one
# space-separated string per article so that each article is one document.
all.g_data3 <- lapply(g_data.text, extractNoun)
all.g_data <- vapply(all.g_data3, paste, character(1), collapse = " ")

g_data.tdm <- get.tdm(all.g_data)
Encoding(g_data.tdm$dimnames$Terms) <- "UTF-8"
g_data.matrix <- as.matrix(g_data.tdm)
g_data.counts <- rowSums(g_data.matrix)
g_data.df <- data.frame(term = names(g_data.counts),
                        frequency = as.numeric(g_data.counts),
                        stringsAsFactors = FALSE)
# Share of documents in which each term appears at least once.
g_data.occurrence <- unname(rowSums(g_data.matrix > 0)) / ncol(g_data.matrix)
g_data.density <- g_data.df$frequency / sum(g_data.df$frequency)
g_data.df <- transform(g_data.df, density = g_data.density,
                       occurrence = g_data.occurrence)
# head(g_data.df[with(g_data.df, order(-density)), ], 30)
# Innocent training data ----
# Builds ug_data.df: one row per term with its total frequency, its density
# (share of all term occurrences) and its occurrence (share of articles that
# contain the term).
ug_data.docs <- dir(ug_data.path)
all.ug_data <- read.delim("g/ug_data.csv", sep = ",", stringsAsFactors = FALSE,
                          header = FALSE, na.strings = "")
names(all.ug_data) <- c("제목", "내용")

# Remove special characters column by column. Calling gsub() on the whole data
# frame would deparse each column into one long string, and the characters
# "<" and ">" have to sit inside the bracket expression to be stripped.
all.ug_data2 <- all.ug_data
all.ug_data2[] <- lapply(all.ug_data, function(column) {
  gsub("[{}~!@#$%^&*()_+=?<>]", "", column)
})

# One text per article: the cells of a row joined by a space, with missing
# cells treated as empty. Articles with no text left are dropped.
ug_data.text <- do.call(paste, unname(lapply(all.ug_data2, function(column) {
  column[is.na(column)] <- ""
  column
})))
ug_data.text <- trimws(ug_data.text)
ug_data.text <- ug_data.text[nzchar(ug_data.text)]

# Keep nouns only: a list with one noun vector per article, then one
# space-separated string per article so that each article is one document.
all.ug_data3 <- lapply(ug_data.text, extractNoun)
all.ug_data <- vapply(all.ug_data3, paste, character(1), collapse = " ")

ug_data.tdm <- get.tdm(all.ug_data)
Encoding(ug_data.tdm$dimnames$Terms) <- "UTF-8"
ug_data.matrix <- as.matrix(ug_data.tdm)
ug_data.counts <- rowSums(ug_data.matrix)
ug_data.df <- data.frame(term = names(ug_data.counts),
                         frequency = as.numeric(ug_data.counts),
                         stringsAsFactors = FALSE)
# Share of documents in which each term appears at least once.
ug_data.occurrence <- unname(rowSums(ug_data.matrix > 0)) / ncol(ug_data.matrix)
ug_data.density <- ug_data.df$frequency / sum(ug_data.df$frequency)
ug_data.df <- transform(ug_data.df, density = ug_data.density,
                        occurrence = ug_data.occurrence)
# Show the 30 densest terms.
head(ug_data.df[with(ug_data.df, order(-density)), ], 30)
# Classifier definition.
#
# Scores the text of one CSV file against one class's training table.
#
# Args:
#   path:        path of a comma-separated, header-less file read with
#                fileEncoding "euc-kr"; all of its cells together form one
#                message.
#   training.df: data frame with a `term` column and an `occurrence` column.
#   prior:       prior probability of the class.
#   c:           factor applied once for every message term that is not found
#                in training.df$term.
# Returns:
#   prior * product of the occurrence values of the matched terms *
#   c ^ (number of unmatched terms); `prior` itself when the message yields
#   no nouns.
classify.data <- function(path, training.df, prior=0.2, c=1e-6) {
  all.g_data_test <- read.delim(path, sep = ",", stringsAsFactors = FALSE,
                                header = FALSE, na.strings = "",
                                fileEncoding = "euc-kr")

  # Work on the individual cells. Calling gsub() on the data frame itself
  # would deparse each column into a single string.
  cells <- as.character(unlist(all.g_data_test, use.names = FALSE))
  cells <- cells[!is.na(cells)]
  # Remove special characters; "<" and ">" belong inside the bracket
  # expression so they are stripped like the others.
  cells <- gsub("[{}~!@#$%^&*()_+=?<>]", "", cells)
  cells <- cells[nzchar(trimws(cells))]

  # Keep nouns only and join them into one document for this message.
  nouns <- unlist(lapply(cells, extractNoun), use.names = FALSE)
  if (length(nouns) < 1) {
    # No terms at all: the formula below reduces to prior * c^0.
    return(prior)
  }
  msg.tdm <- get.tdm(paste(nouns, collapse = " "))
  msg.freq <- rowSums(as.matrix(msg.tdm))
  msg.match <- intersect(names(msg.freq), training.df$term)

  if (length(msg.match) < 1) {
    return(prior * c^(length(msg.freq)))
  }
  # NOTE(review): prod() over many small occurrence values may underflow to 0
  # for long messages -- confirm whether that matters for this data.
  match.probs <- training.df$occurrence[match(msg.match, training.df$term)]
  prior * prod(match.probs) * c^(length(msg.freq) - length(msg.match))
}
############################# Guilty / innocent verification
# Scores one file against both training tables.
#
# Args:
#   path: path of the file handed to classify.data().
# Returns:
#   Numeric vector c(guilty score, innocent score, flag), where flag is 1 when
#   the guilty score is strictly greater than the innocent score and 0
#   otherwise.
g.classifier <- function(path) {
  guilty.score <- classify.data(path, g_data.df, prior = 0.01)
  innocent.score <- classify.data(path, ug_data.df, prior = 0.99)
  guilty.flag <- as.numeric(guilty.score > innocent.score)
  c(guilty.score, innocent.score, guilty.flag)
}
# Score every file of both test directories and collect the results in
# class.df (columns pr.g, pr.ug, Class, Type).
test_g_data.docs <- list.files(test_g_data.path)
test_g2_data.docs <- list.files(test_ug_data.path)

# One g.classifier() result per file; warnings raised while scoring are
# suppressed.
test_g_data.class <- suppressWarnings(
  lapply(file.path(test_g_data.path, test_g_data.docs), g.classifier)
)
test_g2_data.class <- suppressWarnings(
  lapply(file.path(test_ug_data.path, test_g2_data.docs), g.classifier)
)

# Stack the per-file vectors row-wise and append the label of the directory
# the files came from.
test_g_data.matrix <- do.call(rbind, test_g_data.class)
test_g2_data.matrix <- do.call(rbind, test_g2_data.class)
test_g_data.final <- cbind(test_g_data.matrix, "GUILT")
test_g2_data.final <- cbind(test_g2_data.matrix, "INNOCENCE")
class.matrix <- rbind(test_g_data.final, test_g2_data.final)

# The combined matrix is character, so restore the column types afterwards.
class.df <- setNames(data.frame(class.matrix, stringsAsFactors = FALSE),
                     c("pr.g", "pr.ug", "Class", "Type"))
class.df$pr.g <- as.numeric(class.df[["pr.g"]])
class.df$pr.ug <- as.numeric(class.df[["pr.ug"]])
class.df$Class <- as.logical(as.numeric(class.df[["Class"]]))
class.df$Type <- as.factor(class.df[["Type"]])
# Shares of FALSE and TRUE entries in a logical vector.
#
# Args:
#   bool.vector: logical vector; NA entries count towards the length but
#                towards neither share.
# Returns:
#   Numeric vector c(share of FALSE, share of TRUE).
get.results <- function(bool.vector) {
  total <- length(bool.vector)
  false.share <- sum(bool.vector == FALSE, na.rm = TRUE) / total
  true.share <- sum(bool.vector == TRUE, na.rm = TRUE) / total
  c(false.share, true.share)
}
# Save results as a 2x2 table: one row per true label, one column per
# predicted label.
test_g_data.col <- get.results(subset(class.df, Type == "GUILT")$Class)
test_g2_data.col <- get.results(subset(class.df, Type == "INNOCENCE")$Class)
class.res <- rbind(test_g_data.col, test_g2_data.col)
# get.results() returns c(share of FALSE, share of TRUE), and Class is TRUE
# when the guilty score exceeds the innocent score. The first column is
# therefore the share predicted innocent and the second the share predicted
# guilty.
colnames(class.res) <- c("INNOCENCE", "GUILT")
print(class.res)

# Create final plot of results: log guilty score against log innocent score,
# with the line y = x marking where both scores are equal.
# Scores equal to 0 have log -Inf and cannot be drawn as finite points.
class.plot <- ggplot(class.df, aes(x = log(pr.g), y = log(pr.ug))) +
  # alpha is a fixed setting, so it goes outside aes() and needs no scale.
  geom_point(aes(shape = Type), alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1) +
  scale_shape_manual(values = c("GUILT" = 1,
                                "INNOCENCE" = 2),
                     name = "JUDGMENT Type") +
  xlab("log[Pr(pr.g)]") +
  ylab("log[Pr(pr.ug)]") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.text.y = element_blank())
ggsave(plot = class.plot,
       filename = file.path("./", "03_final_classification4.pdf"),
       height = 10,
       width = 10)