Here's a code chunk that has the same issue: once the function has read in the genome, it never seems to release that memory. This happens even in much longer-running programs, until the memory is exhausted.
from bio import *
def wgz(floc: str, ob: List[str]):
    """Write every string in `ob` to the file at `floc`, one per line.

    The file is opened in 'wb' mode, so existing content is replaced.
    """
    with open(floc, 'wb') as out:
        for entry in ob:
            out.write(f'{entry}\n')
def rgz(floc: str) -> List[str]:
    """Read the file at `floc` and return its lines, each stripped of
    leading and trailing whitespace.
    """
    # Declared before the `with` block so the result is still in scope
    # at the `return` below.
    stripped = List[str]()
    with open(floc, 'rb') as handle:
        for raw in handle:
            stripped.append(raw.strip())
    return stripped
def readch(floc):
    """Load the file at `floc` through `rgz`, discard the result, return 0.

    Used by the surrounding script to check whether memory taken while
    reading is given back once the list is cleared and deleted.
    """
    loaded = rgz(floc)
    # Drop the contents first, then the name itself.
    loaded.clear()
    del loaded
    return 0
@python
def getmemory():
    """Print the resident set size (RSS) of the current process."""
    import os
    import psutil
    rss = psutil.Process(os.getpid()).memory_info().rss
    print(rss)
# Memory repro: load selected chromosomes into a list, write them out,
# release the list, then read the file back three times, printing the
# process RSS after each stage.
reduced_chrom_dict = [
    'chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16',
    'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3',
    'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9'
]
sex_chroms = ['chrX', 'chrY']
chroms = List[str]()

for r in FASTA('/home/assrivat/hg38.fa', fai=False):
    if r.name in reduced_chrom_dict:
        print(r.name)
        # NOTE(review): each autosome is upper-cased and appended twice;
        # kept as posted -- confirm whether the duplication is intended.
        chroms.append(str(r.seq).upper())
        chroms.append(str(r.seq).upper())
    elif r.name in sex_chroms:
        print(r.name)
        chroms.append(str(r.seq).upper())
    else:
        print('skipped')

# Reading 1: all selected sequences are held in `chroms`.
getmemory()
wgz('/media/drive2/assrivat/randomtest.gz', chroms)
chroms.clear()
del chroms
# Reading 2: after the list has been cleared and deleted.
getmemory()

# Readings 3-5: one after each read-and-discard of the written file.
for attempt in range(3):
    readch('/media/drive2/assrivat/randomtest.gz')
    getmemory()
The memory readings printed by getmemory() (process RSS) were:
6942056448
6942244864
12526358528
14309744640
14806085632
from bio import *
@python
def getmemory():
    """Print the resident set size (RSS) of the current process."""
    import os
    import psutil
    current = psutil.Process(os.getpid())
    print(current.memory_info().rss)
# Streaming variant of the repro: each selected record is written straight
# to the output file instead of being collected in a list first.
reduced_chrom_dict = ['chr' + str(n) for n in range(1, 23)]  # 'chr1' .. 'chr22'
sex_chroms = ['chrX', 'chrY']
floc = 'assrivat_out.txt'
getmemory()  # reading taken before any record is read
with open(floc, 'wb') as afile:
for r in FASTA('data/hg38.fa.gz', fai=False):
if (r.name in reduced_chrom_dict):
print(r.name)
afile.write(f'{r.seq}\n')
elif (r.name in sex_chroms):
print(r.name)
afile.write(f'{r.seq}\n')
else:
# print('skipped')
continue
# NOTE(review): indentation was lost in this listing, so it is unclear
# whether the first call below runs once per written record, once inside
# the `with` block, or after the file is closed -- confirm against the post.
getmemory()
getmemory()
list.clear() does not actually clear/free/reset anything; it just sets the list length to 0, while the list elements are still referenced by the underlying array. This could be why the memory is never freed. Can you try adding the following snippet to the top of your program?
@extend
class List:
    def clear(self):
        # Zero the bytes of the first `len` slots of the backing array
        # before resetting the length, so stale element data is not left
        # behind in the array.
        from internal.gc import sizeof
        nbytes = self.len * sizeof(T)
        str.memset(self.arr.ptr.as_byte(), byte(0), nbytes)
        self.len = 0
clear()
methods then to actually do this
def speedSNP(seqs):
    """Apply one random single-base substitution to a randomly chosen sequence.

    One entry of ``seqs`` is picked at random and, unless it is empty, one
    of its positions is replaced by a different base drawn from A/C/T/G.
    ``seqs`` is modified in place.

    Args:
        seqs: Mutable, non-empty list of sequence strings
            (``random.sample`` raises on an empty list).

    Returns:
        A pair ``([-1], [pos, index, target])`` where ``pos`` is the mutated
        position (``-1`` if the chosen sequence was empty), ``index`` is the
        index of the chosen entry in ``seqs`` and ``target`` is
        ``GLOBAL_CHROM_NUM[index]``.
    """
    import random
    BASES = ['A', 'C', 'T', 'G']
    c = list(range(len(seqs)))
    chrom = random.sample(c, 1)
    idx = chrom[0]
    seq = seqs[idx]
    target = GLOBAL_CHROM_NUM[idx]
    n = len(seq)
    pos = 0
    all_positions = []  # positions mutated so far; recorded but not returned
    if n == 0:
        # Nothing to mutate; -1 signals "no position".
        pos = -1
    elif n == 1:
        n_repeats = 1
        all_positions = [0]
        for i in range(n_repeats):
            char = seq[pos]
            # Redraw until the replacement differs from the current base.
            while char == seq[pos]:
                char = random.choice(BASES)
            seq = seq[:0] + char
    else:
        n_repeats = 1
        for i in range(n_repeats):
            pos = random.randint(0, n - 1)
            all_positions.append(pos)
            char = seq[pos]
            # Redraw until the replacement differs from the current base.
            while char == seq[pos]:
                char = random.choice(BASES)
            seq = seq[:pos] + char + seq[pos + 1:]
    seqs[idx] = seq
    return [-1], [pos, idx, target]
Can anyone see an indentation problem with this? I keep getting this dedent error and I double checked all the indentation/formatting
Find a bug in Sequence alignment
from bio import *
# Alignment with the default scoring parameters.
# NOTE: a CIGAR `M` marks an aligned column, which may be either a match or
# a mismatch, so `33M` does not mean that all 33 bases are identical.
s1 = s'CGGAAGAGCGTTTTCAGTTCATCAGGTGTGAAT'
s2 = s'CGGAAGAGCGTTTTCAGTTAATCAGGGGTGAAT'
aln = s1 @ s2
print(aln.cigar, aln.score)  # 33M -2

# Alignment with custom scoring parameters:
# match = 2; mismatch = 4; gap1(k) = 2k + 4; gap2(k) = k + 13
aln = s1.align(
    s2,
    a=2, b=4,
    gapo=4, gape=2,
    gapo2=13, gape2=1
)
print(aln.cigar, aln.score)  # 33M 54
There is a mismatch in the sequences, as follows:
CGGAAGAGCGTTTTCAGTT C ATCAGGTGTGAAT
CGGAAGAGCGTTTTCAGTT A ATCAGGGGTGAAT
But the alignment result says it's a match.
Find a bug in Sequence alignment
from bio import *

# default parameters
s1 = s'CGGAAGAGCGTTTTCAGTTCATCAGGTGTGAAT'
s2 = s'CGGAAGAGCGTTTTCAGTTAATCAGGGGTGAAT'
aln = s1 @ s2
print(aln.cigar, aln.score)  # 33M -2

# custom parameters
# match = 2; mismatch = 4; gap1(k) = 2k + 4; gap2(k) = k + 13
aln = s1.align(s2, a=2, b=4, gapo=4, gape=2, gapo2=13, gape2=1)
print(aln.cigar, aln.score)  # 33M 54
There is a mismatch in the sequences, as follows:
CGGAAGAGCGTTTTCAGTT C ATCAGGTGTGAAT
CGGAAGAGCGTTTTCAGTT A ATCAGGGGTGAAT
But the alignment result says it's a match.
I know the reason, sorry.
# Read whitelisted barcodes from file and convert to a set of Kmers.
# (K = e.g. Kmers[16])
bc_whitelist = read_barcode_whitelist_from_file[K](
    bc_whitelist_filename=bc_whitelist_filename,
    bc_column_idx=0,
    warning=True
)
# Gives an error:
correct_barcode_in_fastq.seq:109:20: error: cannot find '__getitem__' in std.software.single_cell_toolkit.seq_lib.barcode_correction.read_barcode_whitelist_from_file[str,int,bool,kmer_type]
# Function signature:
def read_barcode_whitelist_from_file[kmer_type](bc_whitelist_filename: str, bc_column_idx: int = 0, warning: bool=True):
...
# OLD: the type parameter is declared in brackets after the function name.
def foo[T](x: T):
    pass

# NEW: the type parameter is an ordinary parameter annotated with `type`.
def foo(x: T, T: type):
    pass
Static[int]:
def foo(k: Static[int]):
    ... Kmer[k] ...
k will need to be a compile-time constant; you can also pass these on the command line via -Dk=42 or the like.