评论

收藏

[R语言] R语言并行化处理

编程语言 编程语言 发布于:2021-06-29 23:08 | 阅读数:536 | 评论:0

  
基本设定,加载工具包
  setwd(“H:\程序”)
getwd(
options(scipen = 10)
set.seed(“1330111001”)
require(plyr)
install.packages(“geosphere”)
install.packages(“foreach”)
install.packages(“doParallel”)
library(geosphere)
library(foreach)
library(doParallel)

读取数据
  data_2007_cut <- read.csv(“data_usd_new.csv”, header = TRUE)
companydata <- data_2007_cut[, c(5,6,13)]
zipcode <- read.csv(‘postcode_new.csv’, header = TRUE)
Indust <- read.csv(“Industry4.csv”, header = TRUE)

处理缺失值,其比例为 0.01%不到
  companydata <- apply(companydata, 2,
function(x)as.numeric(as.character(x)))
companydata <- na.omit(companydata)

数据匹配
  id.zip <- sort(unique(zipcode[, 1]))
id.all <- sort(unique(companydata[, 1]))
ID <- as.vector(id.all[id.all %in% id.zip])
rownames(zipcode) <- zipcode[, 1]
  companydata <- companydata[companydata[, 1] %in% ID, ]
companydata <- companydata[companydata[, 2] %in% Indust[, 1], ]

处理数据,以行业为分类,将数据转化为 list 对象
  indust4 <- sort(unique(companydata[, 2]))
  companydata.indust.list4 <- list()
indust4.loop <- c()
k = 1
for (j in (1:length(indust4))) {
if(sum(companydata[, 2] == indust4[j]) > 10) {
indust4.loop <- c(indust4.loop, indust4[j])
companydata.indust.list4[[k]] <- companydata[companydata[, 2] ==
indust4[j], c(1,3)]
k = k + 1
}
}
  names(companydata.indust.list4) <- indust4.loop

对数据进行抽样,并重新汇合整理
  number4 <- ldply(companydata.indust.list4, function(x)length(x[,1]))
number4[number4[,2] > 10, ]
  fun.dest <- function(x) {
if (length(x[,1]) <= 2000) d=x else d = x[sample(1:length(x[,1]), 2000),]
return(d)
}
  companydata.indust.list4 <- llply(companydata.indust.list4, fun.dest)
companydata.new <- do.call(“rbind”, companydata.indust.list4)
  Industry_Num= length(companydata.new)/2

计算密度值
  kernal.density <- function(d, x, h, e){
x.new <- (d - x)/h
e<-compangdata.new[,c(3)]
x.f <-dnorm(x.new)
x.f1<-x.fe^2
density.x <- (sum(x.f1)/2)/(sum(e^2)/2
h)
density.x
}

估计全局置信区间函数
  quantile.function <- function(x) {
quantile <- quantile(x, probs = c(0.05, 0.95,0.05, 0.95), na.rm = TRUE)
return(quantile)
}

估计核密度线
  kernel.density.estimate <- function(x) {
x1 <- as.character(x[, 1])
n <- length(x1)
id <- sort(x1)
id.location <- zipcode[id, c(2, 3)]
library(geosphere)
data.dist <- distm(id.location)/1000
dimnames(data.dist) <- list(id, id)
vectorlization  x.vector <- data.dist[upper.tri(data.dist, diag=FALSE)]
optimal bw  h <- 0.9 * min(sd(x.vector), (quantile(x.vector, 0.75) -
quantile(x.vector, 0.25))/1.34)* n^(-0.2)
#compulate density value and curve
loop <- seq(0, 200, length.out = 51)
matrix.x <- t(rbind(loop, sapply(loop,
kernal.density,x=data.dist,h=h,n=n,e=e)))
colnames(matrix.x) <- c(“Km”, “Density”)
matrix.x
}

并行化处理
  cl <- makeCluster(4) # My computer has 2 cores
registerDoParallel(cl)
真实行业集聚的核密度估计
  density.dest.list <- list()
time0 <- Sys.time()
  class(companydata.indust.list4)
as.data.frame(companydata.indust.list4[1])
  density.dest.list <- foreach (i =1:5) %dopar% {
kernel.density.estimate(companydata.indust.list4[1])
}
  time1 <- Sys.time()
time1 - time0
构建模拟行业并重复 1000 次,得出全局和局部置信区间
  set.seed(“1234”)
time0 <- Sys.time()
  companydata.new.small <- companydata.new[companydata.new[,2] == 1, ]
companydata.new.middle <- companydata.new[companydata.new[,2] == 2, ]
companydata.new.large <- companydata.new[companydata.new[,2] == 3, ]
n1 <- length(companydata.new.large[, 1])
n2 <- length(companydata.new.middle[, 1])
n3 <- length(companydata.new.small[, 1])
  quantile.dest.list <- list()
  for (i in ( 1: 5)) {
papa <- companydata.indust.list4[]
n.1 <- length(papa[papa[,2] == 1, 1])
n.2 <- length(papa[papa[,2] == 2, 1])
n.3 <- length(papa[papa[,2] == 3, 1])
iters <- 1000
ID.sample <- foreach (j = 1:iters) %do% {
papa.1 <- companydata.new.large[sample(1:n1, n.1), ]
papa.2 <- companydata.new.middle[sample(1:n2, n.2), ]
papa.3 <- companydata.new.small[sample(1:n3, n.3), ]
rbind(papa.1, papa.2, papa.3)
}
  density.sample.matrix <- foreach(i = 1:iters, .combine = cbind) %dopar%
{
kernel.density.estimate(ID.sample[])[, 2]
}
density.quantile <- t(apply(density.sample.matrix, 1,
quantile.function))
quantile.dest.list[] <- density.quantile
message(paste(i, “done”, " "), appendLF = FALSE)
}
  time1 <- Sys.time()
time1 - time0

将真实核密度值和置信区间值合并
  options(scipen = 10)
require(plyr)
SIC3 <- mapply(cbind, density.dest.list, quantile.dest.list,
SIMPLIFY=FALSE)
names(SIC3) <- indust4.loop
write.csv(SIC3,“SIC3.CSV”)

计算 Tao 值和 Psi 值
  tao.fun <- function(x) {
out <- ifelse(x[, 2] - x[, 5] > 0, x[, 2] - x[, 5], 0)
if (sum(out) == 0) {
put <- apply(x[, c(2,6)], 1, function(w) max(w[2]-w[1], 0))
} else put <- 0
x.new <- data.frame(Km = x[,1], tao = out, psi = put)
x.new
}
  tao.value <- llply(SIC3, tao.fun)

Tao 值
  tao_all <- do.call(“cbind”, llply(tao.value, function(x) x[,2]))
colnames(tao_all) <- indust4.loop
rownames(tao_all) <- as.character(round(seq(0, 200, length.out = 51)))

Psi 值
  psi_all <- do.call(“cbind”, llply(tao.value, function(x) x[,3]))
colnames(psi_all) <- indust4.loop
rownames(psi_all) <- as.character(round(seq(0, 200, length.out = 51)))

以行业为对象集聚程度指标
  tao_psi_industry <- data.frame(tao = apply(tao_all, 2, sum),
psi = apply(psi_all, 2, sum))

以距离为对象集聚程度指标
  tao_psi_distance <- data.frame(tao = apply(tao_all, 1, sum),
psi = apply(psi_all, 1, sum))
  ##空间分布的基本特征
集聚  local_NO <- sum(tao_psi_industry[, 1] > 0)
local_percent <- sum(tao_psi_industry[, 1] > 0)/5
分散  disper_NO <- sum(tao_psi_industry[, 2] > 0)
disper_percent <- sum(tao_psi_industry[, 2] > 0)/5
既不集聚也不分散  behavior_NO <- 5-local_NO-disper_NO
behavior_percent <- behavior_NO/5

绘图参数设置
  par(mfrow=c(1,1))
par(mar=c(4,4,2,1)+0.1, cex = 0.8)

基准分析中集聚绘图
  local_NO_dist <- apply(tao_all, 1, function(x) sum(x > 0))
local_NO_dist_plot <- data.frame(Km = as.numeric(names(local_NO_dist)), NO
= local_NO_dist)
jpeg(file=“集聚绘图”)
print(plot(local_NO_dist_plot$Km, local_NO_dist_plot[, 2], type = “s”, ylab =
“”,xlab = “Distance(km)”, bty= “l”,ylim = c(0, 5)))
dev.off()

分散绘图
  disper_NO_dist <- apply(psi_all, 1, function(x) sum(x > 0))
disper_NO_dist_plot <- data.frame(Km = as.numeric(names(disper_NO_dist)),
NO = disper_NO_dist)
points(disper_NO_dist_plot[, 1], disper_NO_dist_plot[, 2],type = “s”, lty
= 2)

集聚和分散程度绘图
  jpeg(file=“集聚和分散程度绘图”)
print(plot(disper_NO_dist_plot[, 1], tao_psi_distance[, 1], xlab =
“Distance(Km)”,bty = “l”, ylab = “”,bty = “l”,type = “l”, ylim = c(0, 0.03))
points(disper_NO_dist_plot[, 1], tao_psi_distance[, 2], type = “l”, lty =
2))
最集聚或最分散的行业汇总
  tao.industry.sort <- tao_psi_industry[order(tao_psi_industry[,1],
decreasing = TRUE), ]
write.csv(tao.industry.sort, “tao.industry.sort.csv”)
  psi.industry.sort <- tao_psi_industry[order(tao_psi_industry[,2],
decreasing = TRUE), ]
write.csv(psi.industry.sort, “psi.industry.sort.csv”)

  
关注下面的标签,发现更多相似文章