Python - Amending a script to turn all the text files in a folder into CSV files
I have a large directory full of thousands of text files. I have a script that I can run on each file, one at a time, to turn it into a CSV file. I want to amend the script so that it automatically runs through every text file in the folder and turns each one into a CSV file. The script I have:
import csv
import glob
import os
import re
import sys

# NOTE(review): every data-format literal below (' copyright ', 'endofile',
# 'all rights reserved', 'gmt', the [a-z] patterns, and the one-space string
# compared against line[:2]) is reproduced exactly as it appears in this copy
# of the script.  They look case-folded / whitespace-collapsed -- confirm them
# against a real input file before relying on the output.

# Columns that are always written, ahead of any detected metadata columns.
_BASE_COLUMNS = ('search_row', 'publication', 'date', 'title', 'edition')

# A line that starts with "key:" is treated as a metadata line.
_META_LINE = re.compile(r'^([a-z][a-z-]*?):')


def _parse_document(doc, meta_list):
    """Build the CSV row for one document chunk.

    Args:
        doc: Text of a single document, paragraphs separated by blank
            ``\\r\\n\\r\\n`` lines.
        meta_list: Metadata keys that have their own CSV column.

    Returns:
        A tuple: id, publication, date, title, edition, one value per key in
        ``meta_list`` (empty string when absent), then the body text.

    Raises:
        IndexError: If the chunk has fewer than four paragraphs or its third
            paragraph has fewer than three space-separated words.
    """
    # Join hard-wrapped lines inside each paragraph; drop empty paragraphs
    # and the occasional copyright paragraph.
    rows = [
        row.replace('\r\n', ' ')
        for row in doc.split('\r\n\r\n')
        if len(row) > 0 and 'all rights reserved' not in row
    ]

    # The id number (from the search) is the first word of the first paragraph.
    docid = rows[0].lstrip().split(' ')[0]
    publication = rows[1].lstrip()
    dateedition = rows[2].lstrip()
    date_parts = dateedition.split(' ')
    date = (date_parts[0] + ' ' + date_parts[1] + ' '
            + date_parts[2].replace(',', ''))
    # Whatever follows the date is the edition, unless it is a weekday/zone.
    edition = dateedition.replace(date, '').split(' ')[-1].lstrip()
    if 'gmt' in edition or 'day' in edition:
        edition = ''
    title = rows[3]

    text = ''
    meta_dict = {key: '' for key in meta_list}
    for line in rows:
        meta_match = _META_LINE.match(line)
        is_body = (
            len(line) > 0
            and line[:2] != ' '  # NOTE(review): 2-char slice vs 1-char string
            and line != line.upper()
            and meta_match is None
            and title not in line
        )
        if is_body:
            text = text.lstrip() + ' ' + line.replace('","', '" , "')
        elif meta_match is not None and meta_match.group(1) in meta_dict:
            key = meta_match.group(1)
            meta_dict[key] = line.replace(key + ': ', '')

    row = (docid, publication, date, title, edition)
    row += tuple(meta_dict[key] for key in meta_list)
    return row + (text,)


def split_ln(fname):
    """Convert one downloaded text file into a CSV file next to it.

    The file is split into documents at each copyright line, commonly
    occurring ``key:`` metadata lines become extra columns, and one CSV row is
    written per document.  The output path is ``fname`` with its extension
    replaced by ``.csv``.

    Args:
        fname: Path of the text file to convert.

    Returns:
        The path of the CSV file that was written.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        IndexError: If a document does not have the expected leading
            paragraphs (see ``_parse_document``).
    """
    print('processing\t', fname)

    # splitext only touches the final extension; str.replace on the whole
    # path would also rewrite e.g. a "txt" directory name.
    outname = os.path.splitext(fname)[0] + '.csv'

    # newline='' keeps the '\r\n' sequences the parser splits on;
    # errors='replace' keeps a stray non-UTF-8 byte from aborting the file.
    with open(fname, encoding='utf-8', errors='replace', newline='') as infile:
        lnraw = infile.read()

    # Silly hack to find the end of each document.
    workfile = re.sub(r' copyright .*?\r\n', 'endofile', lnraw)
    # Clean the byte-order-mark crud at the beginning of the file.
    workfile = workfile.replace('\ufeff\r\n', '')
    documents = [
        doc for doc in workfile.split('endofile')
        if len(doc.split('\r\n\r\n')) > 2  # remove blank chunks
    ]

    if documents:
        # Keep only metadata keys that are common relative to the number of
        # documents; sorted() makes the column order reproducible.
        candidates = sorted(set(re.findall(r'\n([a-z][a-z-]*?):', lnraw)))
        meta_list = [
            key for key in candidates
            if lnraw.count(key) / len(documents) > .20
        ]
    else:
        # No documents: nothing to normalise against (avoids dividing by 0).
        meta_list = []

    # Parse everything before opening the output so a malformed input does
    # not leave a truncated CSV behind.
    rows = [_parse_document(doc, meta_list) for doc in documents]

    with open(outname, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(_BASE_COLUMNS + tuple(meta_list) + ('text',))
        writer.writerows(rows)

    print('wrote\t\t', outname)
    return outname


def _expand_inputs(args):
    """Turn command-line arguments into a flat list of input file paths.

    A directory expands to the ``.txt`` files directly inside it (sorted, not
    recursive); an unexpanded wildcard pattern is expanded with ``glob``;
    anything else is passed through unchanged.
    """
    paths = []
    for arg in args:
        if os.path.isdir(arg):
            for name in sorted(os.listdir(arg)):
                candidate = os.path.join(arg, name)
                if name.lower().endswith('.txt') and os.path.isfile(candidate):
                    paths.append(candidate)
        elif not os.path.exists(arg) and any(ch in arg for ch in '*?['):
            # Shells that do not expand wildcards pass the pattern verbatim.
            paths.extend(sorted(glob.glob(arg)))
        else:
            paths.append(arg)
    return paths


def main(argv=None):
    """Convert every file or folder named on the command line.

    Args:
        argv: Argument list to use instead of ``sys.argv[1:]``.

    Returns:
        0 if every file converted, 1 if nothing was given or any file failed.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    flist = _expand_inputs(args)
    if not flist:
        print('usage: pass one or more text files, a wildcard such as *.txt, '
              'or a folder of .txt files', file=sys.stderr)
        return 1

    failures = 0
    for fname in flist:
        try:
            split_ln(fname)
        except (OSError, IndexError) as err:
            # One bad file should not stop a run over thousands of files.
            failures += 1
            print('failed\t\t', fname, '-', repr(err), file=sys.stderr)
    print('done')
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())
Comments
Post a Comment