如何合并具有两个公共列的多个文件,并将添加的列命名为文件名?

Deb*_*raj 6 awk bioinformatics bed

我试图按照下面这个问题中的做法,以前两列 chr 和 start 作为公共键来合并多个 .bed 文件:

将多个文件合并为两个公共列,并将空白替换为0

但是,我想知道如何使文件名成为新添加的列名。

$cat combineFWPS_02.sh

    # Merge the third column of every input file on the key built from the
    # first two columns. Each input file owns one output column; a key that
    # does not occur in a file gets 0 in that column.
    BEGIN {
       # Template row: one " 0" placeholder per command-line argument.
       for (k=1; k<ARGC; ++k)
          s = s " " 0
    }
    FNR == 1 {
       # Count input files in a private variable. ARGIND is a built-in that
       # GNU awk sets by itself each time it opens a file, so incrementing it
       # here would push every value one column to the right under gawk and
       # lose the column of the last file.
       ++fileIdx
    }
    {
       key=$1 OFS $2
       if (!(key in map))
          map[key] = s
       # Unpack the row stored so far, overwrite the column of the current
       # file, then pack the row again.
       split(map[key], a)
       a[fileIdx] = $3
       v = ""
       for (k=1; k<ARGC; ++k)
          v = v " " a[k]
       map[key]=v
    }
    END {
       # Traversal order of "for (k in map)" is unspecified; sort the output
       # afterwards if ordered rows are needed.
       for (k in map)
          print k map[k]
    }

$cat comRwps_02.sh

# Location of the awk merge program. A "~" inside double quotes is never
# tilde-expanded by the shell, so the path is built from $HOME instead.
awkCOM="${HOME}/scripts/combineFWPS_02.sh"
## Run the jobs
# Quote the expansion so a path containing spaces stays a single argument.
time awk -f "$awkCOM" *.xyz.bed | sort -k1 > 13jLiC.xyz.txt
Run Code Online (Sandbox Code Playgroud)

输入文件如下所示:

FF85561.xyz.bed:

chr1 111001 234
chr2 22099  108
chr5 463100 219

FF85574.xyz.bed:

chr1 111001 42
chr1 430229 267
chr5 663800 319

FF85631.xyz.bed:

chr1 111001 92
chr3 22099  144
chr5 663800 311

FF85717.xyz.bed:

chr1 111001 129
chr1 157901 79
chr2 22099  442
Run Code Online (Sandbox Code Playgroud)

预期的输出文件将是

$head 13jLiC.xyz.txt

chr    start    FF85561    FF85574    FF85631    FF85717
chr1   111001    234         42          92         129
chr1   157901      0          0           0          79
chr1   430229      0        267           0           0
chr2    22099    108          0           0         442
chr3    22099      0          0         144           0
chr5   463100    219          0           0           0
chr5   663800      0        319         311           0
Run Code Online (Sandbox Code Playgroud)

Rav*_*h13 5

根据您给出的示例和尝试,请在 GNU awk 中试用以下 awk 解决方案。

awk '
# Merge all input files on their first two columns (chr, start). Every file
# contributes one value column labelled with the part of its name that
# precedes the first dot; keys missing from a file are reported as 0.
BEGIN { OFS="\t" }
FNR==1{
  # First record of a new input file: remember its column label.
  split(FILENAME,fileName,".")
  files[++count]=fileName[1]
}
{
  baseArr[$1 OFS $2]                    # the bare reference registers the key
  # Index by file position rather than by label, so two files whose names
  # share the same prefix cannot overwrite each other.
  keyPresense[$1 OFS $2 OFS count]=$3
}
END{
  # Header line, separated by OFS like every data line.
  printf("%s%s%s","chr",OFS,"start")
  for(i=1;i<=count;i++){
    printf("%s%s",OFS,files[i])
  }
  printf("%s",ORS)
  for(j in baseArr){
    val=""
    for(k=1;k<=count;k++){
      cell=j OFS k
      val=val OFS (cell in keyPresense ? keyPresense[cell] : "0")
    }
    print j val
  }
}
' *.bed | column -t | {
  # Emit the header untouched and sort only the data rows; where the header
  # would land inside sort depends on the collation rules of the locale.
  IFS= read -r header && printf '%s\n' "$header"
  sort -sk1
}
Run Code Online (Sandbox Code Playgroud)


Ed *_*ton 5

适用于任何 awk:

$ cat tst.awk
# Build a table in memory: row 1 is the header, every further row belongs to
# one distinct (column 1, column 2) key in order of first appearance, and
# every input file adds one column. Cells never assigned are printed as 0.
BEGIN {
    OFS = "\t"
    nRows = 1
    grid[nRows, 1] = "chr"
    grid[nRows, 2] = "start"
    nCols = 2
}
FNR == 1 {
    # First record of a new file: open a new column whose header is the file
    # name with everything from the first dot onwards removed.
    label = FILENAME
    sub(/\..*/, "", label)
    nCols++
    grid[1, nCols] = label
}
{
    id = $1 FS $2
    if (id in rowOf) {
        r = rowOf[id]
    } else {
        # Unseen key: append a row and store its two key cells.
        r = rowOf[id] = ++nRows
        grid[r, 1] = $1
        grid[r, 2] = $2
    }
    grid[r, nCols] = $3
}
END {
    for (r = 1; r <= nRows; r++) {
        line = ""
        for (c = 1; c <= nCols; c++) {
            if (c > 1) {
                line = line OFS
            }
            if ((r, c) in grid) {
                line = line grid[r, c]
            } else {
                line = line 0
            }
        }
        print line
    }
}
Run Code Online (Sandbox Code Playgroud)

$ awk -f tst.awk *.bed
chr     start   FF85561 FF85574 FF85631 FF85717
chr1    111001  234     42      92      129
chr2    22099   108     0       0       442
chr5    463100  219     0       0       0
chr1    430229  0       267     0       0
chr5    663800  0       319     311     0
chr3    22099   0       0       144     0
chr1    157901  0       0       0       79
Run Code Online (Sandbox Code Playgroud)

如果您希望对行进行排序,那么您可以应用“装饰-排序-取消装饰”(Decorate-Sort-Undecorate,DSU)方法:

$ awk -f tst.awk *.bed | awk -v OFS='\t' '{print (NR>1), $0}' | sort -k1,1n -k2,2 -k3,3n | cut -f2-
chr     start   FF85561 FF85574 FF85631 FF85717
chr1    111001  234     42      92      129
chr1    157901  0       0       0       79
chr1    430229  0       267     0       0
chr2    22099   108     0       0       442
chr3    22099   0       0       144     0
chr5    463100  219     0       0       0
chr5    663800  0       319     311     0
Run Code Online (Sandbox Code Playgroud)

如果 chr<number> 字符串末尾的数字可以多于 1 位,并且您希望先按字母部分、再按数字部分的数值大小排序(例如,chr2 排在 chr10 之前),那么您必须将 DSU 部分更改为如下所示:

# -k3,3n compares the split-off chromosome number numerically, so chr2 sorts before chr10.
$ awk -f tst.awk *.bed | awk -v OFS='\t' '{c=$1; sub(/[[:alpha:]]+/,"&" OFS,c); print (NR>1), c, $0}' | sort -k1,1n -k2,2 -k3,3n -k5,5n | cut -f4-
chr     start   FF85561 FF85574 FF85631 FF85717
chr1    111001  234     42      92      129
chr1    157901  0       0       0       79
chr1    430229  0       267     0       0
chr2    22099   108     0       0       442
chr3    22099   0       0       144     0
chr5    463100  219     0       0       0
chr5    663800  0       319     311     0
Run Code Online (Sandbox Code Playgroud)