| 4 |
use LWP::UserAgent; |
use LWP::UserAgent; |
| 5 |
use URI; |
use URI; |
| 6 |
use Cwd; |
use Cwd; |
| 7 |
|
use DBI; |
| 8 |
|
|
| 9 |
my $BASEURL = "http://lists.debian.org/debian" ; |
my $BASEURL = "http://lists.debian.org/debian" ; |
| 10 |
my @PROJECTS = ('med', 'edu', 'jr') ; |
my @PROJECTS = ('med', 'edu', 'jr') ; |
| 11 |
my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'); |
my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'); |
| 12 |
my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster'); |
my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster', |
| 13 |
|
'Debian-med-request', 'Debian testing watch', 'Debian Bug Tracking System', |
| 14 |
|
'Skolelinux archive Installer'); |
| 15 |
|
my @SPAMAUTHORS = ('Pls check this new site'); |
| 16 |
|
|
| 17 |
# Debian-Jr starts in 2000 |
# Debian-Jr starts in 2000 |
| 18 |
my $YEARSTART = 2000; |
my $YEARSTART = 2000; |
| 20 |
my ($sec,$min,$hour,$day,$MONTHEND,$YEAREND,$wday,$yday,$isdst) = localtime(time); |
my ($sec,$min,$hour,$day,$MONTHEND,$YEAREND,$wday,$yday,$isdst) = localtime(time); |
| 21 |
$MONTHEND++; |
$MONTHEND++; |
| 22 |
$YEAREND +=1900; |
$YEAREND +=1900; |
| 23 |
|
$day++; |
| 24 |
|
my $today = "$YEAREND-$MONTHEND-$day"; |
| 25 |
|
|
| 26 |
|
my $dbname = 'cddlistarchives'; |
| 27 |
|
my $dbh = DBI->connect("dbi:Pg:dbname=$dbname"); |
| 28 |
|
|
| 29 |
my $ua = LWP::UserAgent->new( agent => 'varbot'); |
my $ua = LWP::UserAgent->new( agent => 'varbot'); |
| 30 |
$ua->env_proxy; |
$ua->env_proxy; |
| 31 |
|
|
| 32 |
my $cdw = getcwd; |
my $cdw = getcwd; |
| 33 |
my $project; |
my $project; |
| 34 |
|
my $insert = "INSERT INTO listarchive (project, yearmonth, author, subject, url, ts) VALUES (?, ?, ?, ?, ?, '$today')"; |
| 35 |
|
my $datain = $dbh->prepare_cached($insert); |
| 36 |
|
my ( $robot, $robotflag ); |
| 37 |
|
|
| 38 |
foreach $project (@PROJECTS) { |
foreach $project (@PROJECTS) { |
| 39 |
|
# Remove database entries for this project |
| 40 |
|
my $query = "DELETE FROM listarchive WHERE project = '$project'"; |
| 41 |
|
my($daten) = $dbh->prepare_cached($query); |
| 42 |
|
$daten->execute() ; |
| 43 |
|
$daten->finish() ; |
| 44 |
|
|
| 45 |
mkdir($project,0777); |
mkdir($project,0777); |
| 46 |
chdir($project); |
chdir($project); |
| 47 |
my $URL="${BASEURL}-${project}"; |
my $URL="${BASEURL}-${project}"; |
| 56 |
my $datafile = "${year}-${month}" ; |
my $datafile = "${year}-${month}" ; |
| 57 |
unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); } |
unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); } |
| 58 |
my $messagelines = 0; |
my $messagelines = 0; |
| 59 |
my $spamlines = 0; |
my $spamlines = 0; |
| 60 |
|
my $robotlines = 0; |
| 61 |
while ( $url =~ /.+/ ) { # if only one page $url is set to '' |
while ( $url =~ /.+/ ) { # if only one page $url is set to '' |
| 62 |
# print "$year-$month: $url\n"; |
# print "$year-$month: $url\n"; |
| 63 |
my $uri = URI->new($url); |
my $uri = URI->new($url); |
| 70 |
next; |
next; |
| 71 |
} ; |
} ; |
| 72 |
(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs; |
(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs; |
| 73 |
my ($content, $subject, $author, $messages, $pages, $page) ; |
my ($content, $msgurl, $subject, $author, $messages, $pages, $page) ; |
| 74 |
foreach $content (@data) { |
foreach $content (@data) { |
| 75 |
my @lines = split(/(\n)/, $content); |
my @lines = split(/(\n)/, $content); |
| 76 |
# print "------> @lines\n" ; |
# print "------> @lines\n" ; |
| 87 |
# Append next line |
# Append next line |
| 88 |
$line = $linestart . $line; |
$line = $linestart . $line; |
| 89 |
} |
} |
|
print "DEBUG: Whole line is $line\n" ; |
|
| 90 |
$linestart = ''; |
$linestart = ''; |
| 91 |
} |
} |
| 92 |
if ( $line =~ /^\s*<\/?ul>\s*$/ || |
if ( $line =~ /^\s*<\/?ul>\s*$/ || |
| 95 |
$line =~ /^\s*<li><em>Message not available<\/em>/ || |
$line =~ /^\s*<li><em>Message not available<\/em>/ || |
| 96 |
$line =~ /<em>\(continued\)<\/em>\s*$/ || |
$line =~ /<em>\(continued\)<\/em>\s*$/ || |
| 97 |
$line =~ /^\s*$/) { next ; } |
$line =~ /^\s*$/) { next ; } |
| 98 |
if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) { |
if ( ($msgurl, $subject, $author) = |
| 99 |
|
$line =~ m#<li><strong>.*href="(msg\d+\.html)">(.+)</a></strong>\s*<em>(.+)</em>#gs ) { |
| 100 |
$_ = $subject ; |
$_ = $subject ; |
| 101 |
$_ =~ s/^Re:\s*//i ; # Remove Re: |
$_ =~ s/^Re:\s*//i ; # Remove Re: |
| 102 |
$_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following) |
$_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following) |
| 106 |
print "Potential SPAM line - strange subject: $project $year-$month: $subject\n"; |
print "Potential SPAM line - strange subject: $project $year-$month: $subject\n"; |
| 107 |
$spamlines++ ; |
$spamlines++ ; |
| 108 |
} else { |
} else { |
| 109 |
print HTMLSNIP "$subject ; $author\n"; |
if ( $author =~ /^[&#x\d;\sA-F\?]+$/ || |
| 110 |
$messagelines++ ; |
$author =~ /info/i ) { # never had a non-spam message from an author whos name contains info |
| 111 |
|
print "Potential SPAM line - strange author: $project $year-$month: $author\n"; |
| 112 |
|
$spamlines++ ; |
| 113 |
|
} else { |
| 114 |
|
if ( $author =~ /^Tille, Andreas$/ ) { $author = 'Andreas Tille'; } |
| 115 |
|
if ( $author =~ /Steffen Möller/ ) { $author = 'Steffen Moeller'; } |
| 116 |
|
$_ = $author; |
| 117 |
|
$_ = s/ö/ö/g ; |
| 118 |
|
$_ = s/ü/ü/g ; |
| 119 |
|
$robotflag = 0; |
| 120 |
|
foreach $robot (@ROBOTS) { |
| 121 |
|
if ( $author =~ /$robot/ ) { # we are not interested in automatic mails |
| 122 |
|
$robotlines++ ; |
| 123 |
|
$robotflag = 1 ; |
| 124 |
|
last; |
| 125 |
|
} |
| 126 |
|
} |
| 127 |
|
if ( $robotflag == 0 ) { |
| 128 |
|
print HTMLSNIP "$subject ; $author\n"; |
| 129 |
|
$datain->execute($project, "$year-$month-01", $author, $subject, |
| 130 |
|
"${URL}/${year}/${month}/$msgurl") ; |
| 131 |
|
$messagelines++ ; |
| 132 |
|
} |
| 133 |
|
} |
| 134 |
} |
} |
| 135 |
} else { |
} else { |
| 136 |
if ( ($messages, $page, $pages) = $line |
if ( ($messages, $page, $pages) = $line |
| 143 |
$url = ''; |
$url = ''; |
| 144 |
} |
} |
| 145 |
print HTMLSNIP "$messages Messages (counted $messagelines)\n"; |
print HTMLSNIP "$messages Messages (counted $messagelines)\n"; |
| 146 |
if ( $messages != $messagelines + $spamlines ) { |
if ( $messages != $messagelines + $spamlines + $robotlines ) { |
| 147 |
print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n"; |
print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n"; |
| 148 |
} |
} |
| 149 |
} else { |
} else { |
| 150 |
unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ... |
unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ... |
|
print "DEBUG: Continued line $line\n" ; |
|
| 151 |
$linestart = $line; |
$linestart = $line; |
| 152 |
##next ; ##### ??????? if this line is missing line we get $linestart$linestart ... |
##next ; ##### ??????? if this line is missing line we get $linestart$linestart ... |
| 153 |
} else { |
} else { |
| 169 |
chdir($cdw); |
chdir($cdw); |
| 170 |
} |
} |
| 171 |
|
|
| 172 |
|
$datain->finish; |