【原】dplyr總結篇

醫科研 2021-01-25

展開全文

歡迎來到醫科研，這里是白介素2的讀書筆記，跟我一起聊臨床與科研的故事，生物醫學數據挖掘，R語言，TCGA、GEO數據挖掘。

dplyr-總結

有必要對dplyr進行一個總結

對行處理

數據處理中的行，我們也稱為觀測。主要函數包括：filter、arrange和slice
-filter用于篩選行: m %in% (1:10)用法比較重要
-arrange用于排列行， desc()用于設定降序排列，這一點與sort函數類似
-slice用于按位置索引選取或刪減行（負索引表示刪去對應的行）

舉基因表達矩陣的例子來說明更生動

# Attach the tidyverse packages used throughout these notes.
library(tidyverse)
# Restore the saved objects from expma.Rdata; the calls below expect it to
# provide an object named `expma`.
load("expma.Rdata")
# Preview the first rows of the expression matrix.
head(expma)
##
##           GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
## 1007_s_at 15630.200 17048.800 13667.500 15138.800 10766.600 15680.800
## 1053_at    3614.400  3563.220  2604.650  1945.710  3371.290  3406.660
## 117_at     1032.670  1164.150   510.692  5061.200   452.166   400.477
## 121_at     5917.800  6826.670  4562.440  5870.130  3869.480  3680.440
## 1255_g_at   224.525   395.025   207.087   164.835   111.609   130.123
## 1294_at     799.786   839.787   592.434   593.632   431.526   332.962
##

# Report the dimensions of the raw expression matrix.
dim(expma)
# Drop incomplete rows, convert to a data frame, move the row names into an
# "ID" column, and finish as a tibble.
data <- as_tibble(
  rownames_to_column(
    as.data.frame(na.omit(expma)),
    "ID"
  )
)

## filter
# filter() keeps only the rows for which the condition evaluates to TRUE.
# The expression values are doubles, so the target is matched with near()
# (tolerance-based) rather than exact `==`, which can silently miss a value
# that only prints as 3614.400.
data %>%
  filter(near(GSM188013, 3614.400))
##
## # A tibble: 1 x 7
##   ID      GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
##   <chr>       <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 1053_at     3614.     3563.     2605.     1946.     3371.     3407.
##

# Keep the first occurrence of every ID (filter() retains rows where the
# condition is TRUE), then tabulate the same condition; count() plays the
# role of table() here.
count(
  filter(data, !duplicated(ID)),
  !duplicated(ID)
)

## arrange
# Sort by GSM188013 in descending order and keep the top nine rows; the
# slice() calls demonstrate positional selection and removal.
data %>%
  filter(!is.na(ID)) %>%
  arrange(desc(GSM188013)) %>%
  slice(seq(1, n())) %>%   # rows 1..n, i.e. every row
  slice(-(10:n())) %>%     # negative positions drop rows 10..n
  slice(seq(1, 9))         # rows 1..9
##
## # A tibble: 9 x 7
##   ID          GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 GSM188022
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
## 1 211542_x_at    115359   112557     95911.   115259    104002    119307 
## 2 212869_x_at    111036    98746     86384.   109036     94431.   101598 
## 3 201429_s_at    110207   106494     77982.   104826     86286.    95794.
## 4 200801_x_at    107297   103409     81221     88992.    80843.   105781 
## 5 200817_x_at    107276   100417     87705     94664.    89916.   106468 
## 6 207783_x_at    106812   103959     78483.    99711.    88342.    98408.
## 7 212661_x_at    104459   104613     86729.   106335     98321.   109809 
## 8 217740_x_at    104423    97527.    79070.   101645     85800.    91210.
## 9 201257_x_at    104330   103574     82088.    98103.    99495.   105121
##

對列處理

數據處理時，經常需要選擇自己感興趣的列，我們也叫變量。主要包括 select 和 mutate
select函數可以做到篩選列，幾乎可以做到所有篩選，對大批量數據還可以應用正則匹配
select可以篩選，也可以反向選擇刪除一些變量，可配合輔助函數starts_with()、ends_with()、contains()、matches()使用；容易忽略的是select配合everything()可以把自己感興趣的變量移動到前面去

# Rename every sample column (everything after ID) to A1, A2, ... for brevity.
# The names are derived from the actual column count instead of a hard-coded
# 1:6, so the assignment cannot fail with a length mismatch.
colnames(data)[-1] <- str_c("A", seq_len(ncol(data) - 1))
data %>%
  select(1:6) %>%                    # keep the first six columns by position
  select(ID, A5, everything()) %>%   # move A5 to the front, right after ID
  select(starts_with("A")) %>%       # keep only columns whose name starts with "A"
  # NOTE: mean(A1 + A2) is a single scalar (the mean of the summed columns)
  # recycled to every row, which is why the printed result is constant;
  # use (A1 + A2) / 2 if a per-row average is wanted.
  mutate(A6 = mean(A1 + A2)) %>%     # add a new column
  transmute(A2 = mean(A1 + A2))      # keep only the newly computed column
  ##
## # A tibble: 22,283 x 1
##       A2
##    <dbl>
##  1 6910.
##  2 6910.
##  3 6910.
##  4 6910.
##  5 6910.
##  6 6910.
##  7 6910.
##  8 6910.
##  9 6910.
## 10 6910.
## # ... with 22,273 more rows
##

分組摘要

對于數據的分析解釋常需要對其進行分組計算
主要的函數有group_by 與 summarize

# Overall mean of A1. An ungrouped summary like this is of limited use.
# na.rm has to be an argument of mean(); passed to summarize() instead it only
# creates an extra logical column named `na.rm`.
data %>%
  summarize(x = mean(A1, na.rm = TRUE))
##
## # A tibble: 1 x 1
##       x
##   <dbl>
## 1 3459.
##
## Load the probe annotation table
load(file = "probe.Rdata")
head(probe, n = 6L)
##
##          ID Gene Symbol ENTREZ_GENE_ID
## 2   1053_at        RFC2           5982
## 3    117_at       HSPA6           3310
## 4    121_at        PAX8           7849
## 5 1255_g_at      GUCA1A           2978
## 7   1316_at        THRA           7067
## 8   1320_at      PTPN21          11099
##
# Preview the renamed expression tibble before it is joined with `probe`.
head(data)
##
## # A tibble: 6 x 7
##   ID            A1     A2     A3     A4     A5     A6
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 1007_s_at 15630. 17049. 13668. 15139. 10767. 15681.
## 2 1053_at    3614.  3563.  2605.  1946.  3371.  3407.
## 3 117_at     1033.  1164.   511.  5061.   452.   400.
## 4 121_at     5918.  6827.  4562.  5870.  3869.  3680.
## 5 1255_g_at   225.   395.   207.   165.   112.   130.
## 6 1294_at     800.   840.   592.   594.   432.   333.
##
# Attach the probe annotation by ID (inner join keeps only matching probes),
# give the annotation columns shorter names, and move the three identifier
# columns to the front.
data <- data %>%
  inner_join(probe, by = "ID") %>%
  rename(
    genename = `Gene Symbol`,
    geneid = ENTREZ_GENE_ID
  ) %>%
  select(ID, genename, geneid, everything())
head(data)
##
## # A tibble: 6 x 9
##   ID        genename geneid    A1    A2    A3    A4    A5    A6
##   <chr>     <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1053_at   RFC2     5982   3614. 3563. 2605. 1946. 3371. 3407.
## 2 117_at    HSPA6    3310   1033. 1164.  511. 5061.  452.  400.
## 3 121_at    PAX8     7849   5918. 6827. 4562. 5870. 3869. 3680.
## 4 1255_g_at GUCA1A   2978    225.  395.  207.  165.  112.  130.
## 5 1316_at   THRA     7067    722.  934.  455.  592.  350.  525.
## 6 1320_at   PTPN21   11099   135.  100.  353.  230.  230.  341.
##
# The author reports 20878 rows here. Rows are annotated probes rather than
# distinct genes: the grouped summary further down prints 12,549 genename groups.
dim(data)

## group_by() combined with summarize(): grouped summaries
data %>%
  group_by(genename) %>%   # only marks the groups; nothing is computed yet
  summarize(
    count = n(),           # number of rows (probes) in each group
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    # This produces the single column a1, so the form does not scale to
    # summarising every sample column at once.
    a1 = mean(A1, na.rm = TRUE)
  )

## Average the probes that map to the same gene
# Only the grouping column and the sample columns are kept, because
# summarise_all() applies the function to every non-grouping column.
# Replace `mean` with any other summary function as needed.
# NOTE(review): the printed result contains a genename "" group, i.e. probes
# with an empty symbol are averaged together; confirm that is intended.
summarise_all(
  group_by(
    select(data, genename, A1:A6),
    genename
  ),
  .funs = mean
)
##
## # A tibble: 12,549 x 7
##    genename     A1     A2     A3     A4     A5     A6
##    <chr>     <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 ""       1335.  1207.   756.   936.   800.   849. 
##  2 A1CF     2017.  1569.   982.  1363.   856.   839. 
##  3 A2M        93.7   84.8  282.    49.3   45.9   46.9
##  4 A4GALT    211.   225.   409.   131.    59.9  383. 
##  5 A4GNT    1211.  1161.   671.   439.   690.   572. 
##  6 AAAS      616.   593.  3334.  2370.  1581.  1714. 
##  7 AACS     5697.  5413.  4494.  4389.  4659.  4447. 
##  8 AADAC      89.0   57.4   30.3   58.7   51.6   47.7
##  9 AAGAB    1895.  1824.  2340.  2128.  2777.  2921. 
## 10 AAK1      419.   484.   309.   520.   389.   446. 
## # ... with 12,539 more rows
##
## Example
# Summarise every non-grouping column per species in one call; the output
# columns keep their original names, so no new names have to be supplied.
summarise_all(group_by(iris, Species), .funs = mean)
##
## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03
##
## Same pattern as above, but with several summary functions at once; each
## output column is named <column>_<function>, which reads more clearly.
iris %>%
  group_by(Species) %>%
  summarise_all(list(min = min, max = max))
##
## # A tibble: 3 x 9
##   Species Sepal.Length_min Sepal.Width_min Petal.Length_min Petal.Width_min
##   <fct>              <dbl>           <dbl>            <dbl>           <dbl>
## 1 setosa               4.3             2.3              1               0.1
## 2 versic~              4.9             2                3               1  
## 3 virgin~              4.9             2.2              4.5             1.4
## # ... with 4 more variables: Sepal.Length_max <dbl>,
## #   Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>
##
## summarise_at(): summarise only the named columns
# Extra arguments (here na.rm = TRUE) are forwarded to mean().
starwars %>%
  summarise_at(vars(height, mass), mean, na.rm = TRUE)

group_by與filter聯用

例如我們要找出有多于3個探針對應同一個基因的分組：group_by聯用filter可以按分組進行篩選

# Keep only the rows of genes that are represented by more than three probes;
# inside a grouped filter(), n() is the size of the current group.
filter(
  group_by(data, genename),
  n() > 3
)
## # A tibble: 4,081 x 9
## # Groups:   genename [662]
##    ID          genename geneid     A1     A2     A3     A4     A5     A6
##    <chr>       <chr>    <chr>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 121_at      PAX8     7849    5918.  6827.  4562.  5870.  3869.  3680.
##  2 1316_at     THRA     7067     722.   934.   455.   592.   350.   525.
##  3 1320_at     PTPN21   11099    135.   100.   353.   230.   230.   341.
##  4 1494_f_at   CYP2A6   1548    1128.  1495.   965.  1567.   647.   731.
##  5 160020_at   MMP14    4323    2415.  2017.  2408.  2618.  1696.  1605.
##  6 177_at      PLD1     5337    1384.  1506.   586.   561.   608.   400.
##  7 200014_s_at HNRNPC   3183   13273. 12936. 15264  13516. 19769. 17326.
##  8 200047_s_at YY1      7528   22706. 23938. 18820. 18727. 22232. 20630.
##  9 200067_x_at SNX3     8724   10194. 10602.  9283.  9883. 11139. 10550.
## 10 200073_s_at HNRNPD   3184   15528  15212. 11880.  9753. 18304  16338.
## # ... with 4,071 more rows

distinct函數

去重函數，可以發現它們也是有配套的distinct_all等函數

# Build a 100-row tibble of two integer columns drawn with replacement from
# 1..10. The argument is spelled out as `replace`; `rep = TRUE` only worked
# through partial argument matching.
df <- tibble(
  x = sample(10, 100, replace = TRUE),
  y = sample(10, 100, replace = TRUE)
)
head(df)
## # A tibble: 6 x 2
##       x     y
##   <int> <int>
## 1    10     4
## 2     3     3
## 3     6     2
## 4     5     9
## 5     8     9
## 6     3     2
# Total number of rows in df.
nrow(df)
## [1] 100
## De-duplicate on the combination of x and y
# 68 distinct rows remain in the run shown below. No set.seed() is visible
# before the sample() calls, so the count will differ between runs.
nrow(distinct(df))
## [1] 68
# Naming both columns explicitly gives the same result.
nrow(distinct(df, x, y))
## [1] 68
## De-duplicate on a single column
# Returns only column x with its unique values (10 of them in the run shown).
df %>%
  distinct(x)
## # A tibble: 10 x 1
##        x
##    <int>
##  1    10
##  2     3
##  3     6
##  4     5
##  5     8
##  6     7
##  7     2
##  8     4
##  9     1
## 10     9
# Same as above for column y: only the unique y values are returned.
df %>%
  distinct(y)
## # A tibble: 10 x 1
##        y
##    <int>
##  1     4
##  2     3
##  3     2
##  4     9
##  5     5
##  6     7
##  7    10
##  8     6
##  9     1
## 10     8
# De-duplicate on x only while keeping all other columns (the first row for
# each x value is retained); this is the form to use for removing duplicates.
df %>%
  distinct(x, .keep_all = TRUE)
## # A tibble: 10 x 2
##        x     y
##    <int> <int>
##  1    10     4
##  2     3     3
##  3     6     2
##  4     5     9
##  5     8     9
##  6     7     7
##  7     2     4
##  8     4     3
##  9     1     4
## 10     9     9
## distinct_all() and the other scoped variants
# Small example tibble with repeated rows.
df <- tibble(
  x = rep(2:5, each = 2) / 2,
  y = rep(2:3, each = 4) / 2
)
df
## # A tibble: 8 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1     1  
## 3   1.5   1  
## 4   1.5   1  
## 5   2     1.5
## 6   2     1.5
## 7   2.5   1.5
## 8   2.5   1.5
# De-duplicate on all columns, i.e. on the combination of x and y.
df %>%
  distinct_all()
## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1.5   1  
## 3   2     1.5
## 4   2.5   1.5
# De-duplicate on an explicitly listed set of columns.
df %>%
  distinct_at(vars(x, y))
## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1.5   1  
## 3   2     1.5
## 4   2.5   1.5
# De-duplicate on the columns that satisfy a predicate (here: numeric ones).
df %>%
  distinct_if(is.numeric)
## # A tibble: 4 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1.5   1  
## 3   2     1.5
## 4   2.5   1.5
## Apply a function to the columns before de-duplicating
# Every column is rounded first, then distinct rows of the rounded values
# are returned.
df %>%
  distinct_all(round)
## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     2     2
# Sort by the combination of all columns, using their rounded values as keys.
df %>%
  arrange_all(list(~round(.)))
## # A tibble: 8 x 2
##       x     y
##   <dbl> <dbl>
## 1   1     1  
## 2   1     1  
## 3   1.5   1  
## 4   1.5   1  
## 5   2     1.5
## 6   2     1.5
## 7   2.5   1.5
## 8   2.5   1.5

關于dplyr的內容總結就到這里，經過這樣的模擬訓練總結，對它的認識也會更深一層

贊賞

共11人贊賞

小男孩‘自慰网亚洲一区二区,亚洲一级在线播放毛片,亚洲中文字幕av每天更新,黄aⅴ永久免费无码,91成人午夜在线精品,色网站免费在线观看,亚洲欧洲wwwww在线观看

【原】dplyr總結篇

對行處理

舉基因表達矩陣的例子來說明更生動

對列處理

分組摘要

group_by與filter聯用

distinct函數