python - Amending a script to turn all the text files in a folder into CSV files -

August 15, 2010

I have a large directory full of thousands of text files. I have a script that I can run on each file, one at a time, to turn it into a CSV file. I would like to amend the script so that it automatically runs through every text file and turns each one into a CSV file. This is the script I have:

import csv
import glob
import io
import os
import re
import sys

# NOTE(review): every matching literal below is lower-case exactly as found
# in this copy of the script, yet `line != line.upper()` treats fully
# upper-case lines as non-body text.  Confirm the expected letter case
# against a real input file before relying on these patterns.

# Marker substituted for each copyright line so the file can be split into
# documents with a plain str.split().
_DOC_END_MARK = 'endofile'
# A heavily indented copyright line is taken as the end of one document.
_COPYRIGHT_LINE = re.compile(r'                copyright .*?\r\n')
# A metadata label at the start of a line in the raw file ("label:").
_META_IN_FILE = re.compile(r'\n([a-z][a-z-]*?):')
# The same label shape at the very start of an already-split block.
_META_AT_START = re.compile(r'([a-z][a-z-]*?):')
# Wide gap that separates the date from the edition on the date line.
_EDITION_GAP = '                         '
# Blocks inside one document are separated by a blank line.
_BLOCK_SEP = '\r\n\r\n'
# Columns that every output row starts with.
_BASE_COLUMNS = ('search_row', 'publication', 'date', 'title', 'edition')


def _open_text(path, mode):
    """Open *path* so that file bytes map one-to-one onto str characters.

    Python 2 gets plain binary mode, which is what its csv module expects.
    Python 3 gets latin-1 with newline translation switched off, so the
    '\\r\\n' separators and the '\\xef\\xbb\\xbf' prefix the parser looks
    for survive reading, and the output is written back byte-for-byte.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return io.open(path, mode, encoding='latin-1', newline='')


def _parse_document(doc, meta_list):
    """Return the CSV row (a tuple of strings) for one raw document.

    The first four non-empty blocks are read as search id, publication,
    date/edition line and title.  Raises IndexError when the document has
    fewer than four such blocks or the date line has fewer than three
    space-separated tokens.
    """
    # Join hard-wrapped lines inside each block; drop empty blocks and the
    # occasional copyright notice.
    parts = [chunk.replace('\r\n', ' ')
             for chunk in doc.split(_BLOCK_SEP)
             if chunk and 'all rights reserved' not in chunk]

    # The id number (from the search) is the first word of the first block.
    docid = parts[0].lstrip().split(' ')[0]
    publication = parts[1].lstrip()
    dateedition = parts[2].lstrip()
    title = parts[3]

    tokens = dateedition.split(' ')
    date = tokens[0] + ' ' + tokens[1] + ' ' + tokens[2].replace(',', '')
    edition = dateedition.replace(date, '').split(_EDITION_GAP)[-1].lstrip()
    # What follows the date is discarded when it contains 'gmt' or 'day'.
    if 'gmt' in edition or 'day' in edition:
        edition = ''

    text = ''
    meta = dict.fromkeys(meta_list, '')
    for line in parts:
        label = _META_AT_START.match(line)
        is_body = (len(line) > 0 and line[:2] != '  '
                   and line != line.upper()
                   and label is None
                   and title not in line)
        if is_body:
            text = text.lstrip() + ' ' + line.replace('","', '" , "')
        elif label is not None and label.group(1) in meta:
            meta[label.group(1)] = line.replace(label.group(1) + ': ', '')

    return ((docid, publication, date, title, edition)
            + tuple(meta[name] for name in meta_list)
            + (text,))


def split_ln(fname):
    """Convert the text file *fname* into a CSV file next to it.

    The output path is *fname* with its extension replaced by '.csv'.  The
    file is split into documents at each copyright line; every document
    becomes one row holding the five base columns, one column per commonly
    occurring metadata label, and the body text.  Progress is printed to
    stdout.  Returns None.
    """
    print('processing\t%s' % fname)

    # splitext only touches the real extension; a plain str.replace would
    # also rewrite the same letters elsewhere in the path.
    outname = os.path.splitext(fname)[0] + '.csv'

    with _open_text(fname, 'r') as infile:
        lnraw = infile.read()

    workfile = _COPYRIGHT_LINE.sub(_DOC_END_MARK, lnraw)
    workfile = workfile.replace('\xef\xbb\xbf\r\n', '')  # crud at the start
    # Keep only pieces with more than two blocks; the rest are blank rows.
    workfile = [doc for doc in workfile.split(_DOC_END_MARK)
                if len(doc.split(_BLOCK_SEP)) > 2]

    # Keep the labels that occur often relative to the number of documents.
    # Sorted so the column order is the same on every run; the emptiness
    # guard avoids dividing by zero when the file holds no documents.
    meta_list = []
    if workfile:
        meta_list = sorted(
            name for name in set(_META_IN_FILE.findall(lnraw))
            if float(lnraw.count(name)) / len(workfile) > .20)

    with _open_text(outname, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(_BASE_COLUMNS + tuple(meta_list) + ('text',))
        for doc in workfile:
            writer.writerow(_parse_document(doc, meta_list))

    print('wrote\t\t%s' % outname)


def _expand_args(args):
    """Turn command-line arguments into a flat list of input files.

    A folder contributes every '*.txt' file directly inside it.  Any other
    argument is expanded as a wildcard pattern, because not every shell does
    that itself; if nothing matches, the argument is passed through
    unchanged so a missing file is still reported when it is opened.
    """
    files = []
    for arg in args:
        if os.path.isdir(arg):
            files.extend(sorted(glob.glob(os.path.join(arg, '*.txt'))))
        else:
            matches = sorted(glob.glob(arg))
            files.extend(matches if matches else [arg])
    return files


if __name__ == "__main__":
    flist = _expand_args(sys.argv[1:])
    if not flist:
        print('no input files. pass text files, folders, or patterns '
              'such as *.txt')
    else:
        for fname in flist:
            split_ln(fname)
        print('done')

Search This Blog

CSS

python - Amending a script to turn all the text files in a folder into CSV files -

Comments

Post a Comment

Popular posts from this blog

php - trouble displaying mysqli database results in correct order -

depending on nth recurrence of job in control M -

sql server - Cannot query correctly (MSSQL - PHP - JSON) -