Perl - searching for a string in text also returns results that match only a substring
I am searching through .txt files to find the ones whose text contains certain strings, and then moving those files into a folder.
I am searching for the following two keywords:
95-b/a and 95-asb/a
My code looks like this.
1st edit: I have posted the entire code.
use warnings;
use File::Copy;
use File::Basename;

# Asker's script, restored to valid Perl (keywords, `my`/`for`, module-name
# case and the [a-zA-Z] class had been mangled in the listing). The logic is
# deliberately left as posted, because the surrounding text discusses it.

my (%count,%countnegative,%countpositive,$i,$j,$key,@keys,@keysnegative,@keyspositive,$token,$tokennegative,$tokenpositive,@tokens,@tokensnegative,@tokenspositive,$totalcount,$negativecount,$positivecount,$totalcountnegativeintext,$totalcountpositiveintext);

my @files = <*.txt>;
foreach my $filetoprocess (@files) {
    open(INFILE,"<$filetoprocess") or die("cannot open file");
    # Count every token that contains at least one letter.
    while (<INFILE>) {
        @tokens = &tokenize($_);
        foreach $token (@tokens) {
            if ($token =~ /[a-zA-Z]/) {
                $count{$token} = $count{$token} ? $count{$token}+1 : 1;
            }
        }
    }
    # BUG (the subject of the question): %count is declared once, above the
    # file loop, and is never emptied. Tokens counted in earlier files are
    # still keys here, so once one file has contained a keyword every later
    # file matches as well.
    @keys = keys %count;
    @keys = sort { $count{$b} <=> $count{$a} } @keys;
    for ($i=0;$i<=$#keys;$i++) {
        if ((lc $keys[$i] eq lc '95-b/a') || (lc $keys[$i] eq lc '95-asb/a')) {
            my $oldlocation = $filetoprocess;
            my $newlocation = '95ba';
            File::Copy::move($oldlocation, $newlocation);
        }
    }
    close(INFILE);
}
exit(0);

# Everything below the exit(0) above is never executed, apart from the sub
# definitions; it is the remainder of the script the asker pasted.
use strict;

my $true = 1;
my $false = 0;
my $text = "";
my $word;

# read text
while (<>) {
    $text .= $_;
}
foreach $word (&tokenize($text)) {
    &printtext(&rule3(&rule2(&rule1(&makeunits(&cleanup($word))))));
}
print "\n";
exit(0);

# tokenize($string): split a string into a list of tokens.
# Works by turning the string into one-token-per-line form in $_ and then
# splitting on newlines. Note that it overwrites the global $_.
# NOTE(review): the listing was lower-cased; [A-Z] and "Mr" below are restored
# on the assumption that these rules target capitalised initials and
# abbreviations - confirm against the original tokenizer.
sub tokenize {
    $_ = $_[0];
    s/\s+/\n/g;                          # each whitespace run -> one newline
    s/^\n//;                             # no leading empty token
    s/$/\n/;                             # make sure the text ends in a newline
    s/([.,!?:;,])\n/\n$1\n/g;            # split punctuation off the end of a token
    s/\n(["'`])([^\n])/\n$1\n$2/g;       # split a quote off the start of a token
    s/([^\n])(["'`])\n/$1\n$2\n/g;       # split a quote off the end of a token
    s/([^\n])([.,])\n/$1\n$2\n/g;        # . or , now exposed at the end of a token
    s/\n([A-Z])\n\./\n$1./g;             # re-attach "." to a single capital letter
    s/\n\.\n([^"A-Z])/\.\n$1/g;          # re-attach "." unless a quote/capital follows
    s/(\.[A-Z]+)\n\.\n/$1.\n/g;          # re-attach "." after ".XYZ"
    s/([^\n])'s\n/$1\n's\n/g;            # split off 's
    s/([^\n])n't\n/$1\nn't\n/g;          # split off n't
    s/([^\n])'re\n/$1\n're\n/g;          # split off 're
    s/\n\$([^\n])/\n\$\n$1/g;            # split a leading $ from what follows
    s/([^\n])%\n/$1\n%\n/g;              # split a trailing % from what precedes
    s/Mr\n\.\n/Mr.\n/g;                  # keep "Mr." together
    return(split(/\n/,$_));
}

# printtext(@units): print each argument, passed through breakunits() and
# reversed character by character, followed by a single space.
sub printtext {
    my $i;
    for ($i=0;$i<@_;$i++) {
        print join('',reverse(split(//,&breakunits($_[$i]))));
    }
    print " ";
}
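It picks up the files that contain 95-b/a or 95-asb/a, but it also picks up files that contain only 95-b or 95-asb. I don't want that; I want it to pick only the files containing 95-b/a or 95-asb/a.
I suppose I am doing something wrong in handling the forward slash? Does anyone have a solution?
Thanks in advance.
2nd edit: I think that if I put the string check inside the loop that finds the tokens, it works fine. I must be messing up the keys in the hash when I do the check in the next step, and I don't see a reason why I shouldn't do the check in the first step, on the tokens. What do you think?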
# Excerpt from the asker's 2nd edit, restored to valid Perl: the keyword check
# is done on each token as it is read instead of on the keys of %count.
# @tokens, $token and %count are the variables declared at the top of the
# asker's full script.
my @files = <*.txt>;
foreach my $filetoprocess (@files) {
    open(INFILE,"<$filetoprocess") or die("cannot open file");
    while (<INFILE>) {
        @tokens = &tokenize($_);
        foreach $token (@tokens) {
            if ($token =~ /[a-zA-Z]/) {
                if (($token eq '95-b/a') || ($token eq '95-asb/a')) {
                    # NOTE(review): the file is moved while INFILE is still
                    # open and reading continues, so a second matching token
                    # calls move() again on the already-moved path.
                    my $oldlocation = $filetoprocess;
                    my $newlocation = '95ba';
                    File::Copy::move($oldlocation, $newlocation);
                }
                $count{$token} = $count{$token} ? $count{$token}+1 : 1;
            }
        }
    }
}   # closes the per-file loop (this brace was absent from the excerpt as posted)
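I removed the unneeded parts of the code and modified it to be more readable. It's clear what the problem is: %count is global, but you need a new %count for each file. Because it is never reset, tokens counted in an earlier file are still keys of the hash when later files are checked, so every file processed after the first match is picked up as well.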
#!/usr/bin/perl
use warnings;
use strict;

# For every *.txt file in the current directory, count its tokens and report
# the file if one of the two keywords occurs among them (case-insensitively).

my @files = glob '*.txt';

for my $filetoprocess (@files) {
    my %count;    # <---- here. Declare %count in the loop, so each file starts empty.
    open my $in, '<', $filetoprocess or die "cannot open $filetoprocess: $!";
    while (my $line = <$in>) {
        for my $token (tokenize($line)) {
            if ($token =~ /[a-zA-Z]/) {
                ++$count{$token};    # ternary ? : not needed.
            }
        }
    }
    close $in;

    my @keys = sort { $count{$b} <=> $count{$a} } keys %count;
    for my $key (@keys) {
        if (lc $key eq lc '95-b/a' or lc $key eq lc '95-asb/a') {
            print "move $filetoprocess because of $key.\n"
        }
    }
}

# tokenize($string): split a string into a list of tokens.
# The text is rewritten into one-token-per-line form and then split on
# newlines. It works on a private copy, so the caller's $_ is left untouched.
# NOTE(review): the listing was lower-cased; [A-Z] and "Mr" below are restored
# on the assumption that these rules target capitalised initials and
# abbreviations - confirm against the original tokenizer.
sub tokenize {
    my ($text) = @_;
    for ($text) {                            # alias $_ to the private copy
        s/\s+/\n/g;                          # each whitespace run -> one newline
        s/^\n//;                             # no leading empty token
        s/$/\n/;                             # make sure the text ends in a newline
        s/([.,!?:;,])\n/\n$1\n/g;            # split punctuation off the end of a token
        s/\n(["'`])([^\n])/\n$1\n$2/g;       # split a quote off the start of a token
        s/([^\n])(["'`])\n/$1\n$2\n/g;       # split a quote off the end of a token
        s/([^\n])([.,])\n/$1\n$2\n/g;        # . or , now exposed at the end of a token
        s/\n([A-Z])\n\./\n$1./g;             # re-attach "." to a single capital letter
        s/\n\.\n([^"A-Z])/\.\n$1/g;          # re-attach "." unless a quote/capital follows
        s/(\.[A-Z]+)\n\.\n/$1.\n/g;          # re-attach "." after ".XYZ"
        s/([^\n])'s\n/$1\n's\n/g;            # split off 's
        s/([^\n])n't\n/$1\nn't\n/g;          # split off n't
        s/([^\n])'re\n/$1\n're\n/g;          # split off 're
        s/\n\$([^\n])/\n\$\n$1/g;            # split a leading $ from what follows
        s/([^\n])%\n/$1\n%\n/g;              # split a trailing % from what precedes
        s/Mr\n\.\n/Mr.\n/g;                  # keep "Mr." together
    }
    return split /\n/, $text;
}
perl
No comments:
Post a Comment