Deb*_*raj 6 awk bioinformatics bed
我试图按照此方法,通过匹配前两列 chr 和 start 来合并多个 .bed 文件,
但是,我想知道如何使文件名成为新添加的列名。
$cat combineFWPS_02.sh
# Merge column 3 of every input file into one row per (column 1, column 2)
# key, with one value column per command-line argument; missing values are 0.
BEGIN {
    # Default row: one " 0" per command-line argument.
    for (k = 1; k < ARGC; ++k)
        s = s " " 0
}
# Count input files with a private counter. ARGIND is a gawk built-in that
# gawk resets each time it opens a file, so incrementing it by hand shifts
# every value one column to the right under gawk and drops the last file.
FNR == 1 {
    ++fileNr
}
{
    key = $1 OFS $2
    if (!(key in map))
        map[key] = s
    # Unpack the stored row, overwrite this file's slot, and re-pack it.
    split(map[key], a)
    a[fileNr] = $3
    v = ""
    for (k = 1; k < ARGC; ++k)
        v = v " " a[k]
    map[key] = v
}
END {
    # Traversal order of "for (k in map)" is unspecified; sort downstream.
    for (k in map)
        print k map[k]
}
$cat comRwps_02.sh
# "~" is not expanded inside quotes, so name the home directory via $HOME.
awkCOM="$HOME/scripts/combineFWPS_02.sh"
## Run the jobs
time awk -f "$awkCOM" *.xyz.bed | sort -k1 > 13jLiC.xyz.txt
Run Code Online (Sandbox Code Playgroud)
输入文件如下所示:
FF85561.xyz.bed:
chr1 111001 234
chr2 22099 108
chr5 463100 219
FF85574.xyz.bed:
chr1 111001 42
chr1 430229 267
chr5 663800 319
FF85631.xyz.bed:
chr1 111001 92
chr3 22099 144
chr5 663800 311
FF85717.xyz.bed:
chr1 111001 129
chr1 157901 79
chr2 22099 442
Run Code Online (Sandbox Code Playgroud)
预期的输出文件将是
$head 13jLiC.xyz.txt
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr1 157901 0 0 0 79
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr3 22099 0 0 144 0
chr5 463100 219 0 0 0
chr5 663800 0 319 311 0
Run Code Online (Sandbox Code Playgroud)
根据您显示的示例和尝试,请尝试以下 GNU awk 解决方案。
awk '
# Merge column 3 of several .bed files into one table keyed on columns 1+2,
# with one column per input file (labelled by the file name up to the first
# dot) and 0 where a file has no value for a key.
BEGIN { OFS = "\t" }

# First record of each file: register its column label.
FNR == 1 {
    split(FILENAME, nameParts, ".")
    label = nameParts[1]
    labels[++numFiles] = label
}

{
    key = $1 OFS $2
    seen[key]                    # remember every distinct key
    value[key, label] = $3
}

END {
    # Header and rows are joined with OFS only, so the raw output contains
    # no empty fields or mixed separators.
    header = "chr" OFS "start"
    for (i = 1; i <= numFiles; i++)
        header = header OFS labels[i]
    print header

    for (key in seen) {
        row = key
        for (i = 1; i <= numFiles; i++)
            row = row OFS (((key, labels[i]) in value) ? value[key, labels[i]] : 0)
        print row
    }
}
' *.bed | column -t | sort -sk1
Run Code Online (Sandbox Code Playgroud)
使用任何 awk:
$ cat tst.awk
# Build a table with one row per distinct (field 1, field 2) pair and one
# column per input file; cells that never received a value print as 0.
BEGIN {
    OFS = "\t"
    nRows = 1                    # row 1 holds the header
    cell[1, 1] = "chr"
    cell[1, 2] = "start"
    nCols = 2
}

# First record of each file: append a header cell holding the file name
# with everything from the first dot onwards removed.
FNR == 1 {
    label = FILENAME
    sub(/\..*/, "", label)
    nCols++
    cell[1, nCols] = label
}

# Every record: find (or create) the row for this key and store field 3 in
# the column of the file currently being read.
{
    id = $1 FS $2
    if (id in rowOf) {
        r = rowOf[id]
    } else {
        r = rowOf[id] = ++nRows
        cell[r, 1] = $1
        cell[r, 2] = $2
    }
    cell[r, nCols] = $3
}

END {
    for (r = 1; r <= nRows; r++) {
        line = ""
        for (c = 1; c <= nCols; c++) {
            if ((r, c) in cell)
                out = cell[r, c]
            else
                out = 0
            line = (c == 1 ? "" : line OFS) out
        }
        print line
    }
}
Run Code Online (Sandbox Code Playgroud)
$ awk -f tst.awk *.bed
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr2 22099 108 0 0 442
chr5 463100 219 0 0 0
chr1 430229 0 267 0 0
chr5 663800 0 319 311 0
chr3 22099 0 0 144 0
chr1 157901 0 0 0 79
Run Code Online (Sandbox Code Playgroud)
如果您希望对行进行排序,那么您可以应用装饰-排序-取消装饰(Decorate-Sort-Undecorate,DSU)方法:
# Decorate each line with 0 (first line, the header) or 1 (all other lines) so the header sorts first, sort, then cut drops the decoration.
$ awk -f tst.awk *.bed | awk -v OFS='\t' '{print (NR>1), $0}' | sort -k1,1n -k2,2 -k3,3n | cut -f2-
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr1 157901 0 0 0 79
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr3 22099 0 0 144 0
chr5 463100 219 0 0 0
chr5 663800 0 319 311 0
Run Code Online (Sandbox Code Playgroud)
如果 chr<number> 字符串末尾的数字可以超过 1 位,并且您希望先按字母部分、再按数字部分的数值排序(例如,chr2 排在 chr10 之前),那么您必须将 DSU 部分更改为如下所示:
# Decoration adds three fields: header flag, alphabetic part of column 1, numeric part of column 1. The numeric part must be sorted with "n" so that chr2 precedes chr10.
$ awk -f tst.awk *.bed | awk -v OFS='\t' '{c=$1; sub(/[[:alpha:]]+/,"&" OFS,c); print (NR>1), c, $0}' | sort -k1,1n -k2,2 -k3,3n -k5,5n | cut -f4-
chr start FF85561 FF85574 FF85631 FF85717
chr1 111001 234 42 92 129
chr1 157901 0 0 0 79
chr1 430229 0 267 0 0
chr2 22099 108 0 0 442
chr3 22099 0 0 144 0
chr5 463100 219 0 0 0
chr5 663800 0 319 311 0
Run Code Online (Sandbox Code Playgroud)