#! /usr/bin/perl
#use utf8;
use XML::Parser::Lite;
use Socket;
use Encode qw(:all);
#use threads;
#use threads::shared;
$ttl=15; # minutes
$url='http://news.eu.by/';
$subject='Belarus';
$root=substr($0,0,rindex($0,'/'));
$web="$root/html";
$pub="$web/dir";
$temp="$root/tmp";
$xmlname="newz";
$deltaname="newz";
$htmlname="newz";
#$dbname="$temp/newz";
#my $enc='.gz';
$enc='';
$title='Breaking News!';
$rotate_time=5400; # sec;
#$proxy='195.50.2.154:8080';
@or=('Belarus','Belorussia','Byelorussia','Belarussian','Byelorussian','Bielorussia','Bielorusso','Bielorussa','Belarusse','Bjellorusi');
@or_ru=('Беларусь','Белоруccия','Беларуси','Белоруссии','Белорусский','Белорусская','Белорусское','Белорусские','Беларуский','Беларуская','Беларуское','Беларуские','Белорусских','Беларуских');
my $encoding='utf8';
my $arc="7z";
my $arc_month=-1;
my @month=('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec');
my %mime_xml=('*/*'=>1,'text/html'=>1,'application/xml'=>1,'text/xml'=>1,'application/rss+xml'=>1,'text/plain'=>1);
my $timeout=15;
my $translator=1; # 0-2
$|=1;
alarm($ttl*60-30);
my %arcs=(
"tar.bz2"=>sub{exec("tar -cjf $_[0] --remove-files $_[1]")},
"tar.gz"=>sub{exec("tar -czf $_[0] --remove-files $_[1]")},
"rar"=>sub{exec("rar m -m5 -ds -md4096 -s -inull $_[0] $_[1]")},
"7z"=>sub{
(system("7z a -t7z -m0=lzma -mx=9 -mfb=64 -md=64m -ms=on -bd $_[0] $_[1]")==0)||return 0;
unlink <$_[1]> if($? == 0 &&-e $_[0])
}
);
&install; # if($ARGV[0] eq 'install');
do "$root/fin.pl";
system("mv $temp/fin.xml $temp/fin.xml0");
my $fail=0;
my $heads=qq(
$title
);
my $ad1=q(
);
my $ad2=q(
);
my %cont:shared=(
'root.eu.by'=>'Dzianis Kahanovich',
'www.bspu.unibel.by'=>'Belarussian State Pedagogical University',
'www.belta.by'=>'BELTA',
'news.google.com'=>'Google News',
'finance.google.com'=>'Google Finance',
'www.idealist.org'=>'Idealist.org'
);
my $or_='+OR+';
#my $or_='+%7C+';
my $mimes=join(',',keys(%mime_xml));
my %all;
my %goo=(
''=>{q=>join($or_,qqq(@or))},
'de'=>{q=>join($or_,qqq('Weißrussland','Belarus','Belorussland','Weißrußland','Belorußland','weißrussisch','belarussich','weißruthenisch','Belarusse','Weißrusse','Belarussin','Weißrussin','Weißrussische'))},
'nl_nl'=>{q=>join($or_,qqq('Wit-Rusland','Wit-Russisch','Wit-Rus','Wit-Russin','Wit-Russische','Bjelo-Rusland','Bjelorussisch','Bjelorussische','Bjelorus','Belarus'))},
'fr'=>{q=>join($or_,qqq('Belarus','Biélorussie','Biélorusse','Bielaroussy','Bielarouss','Biélarussie','Biélarusse'))},
'es'=>{q=>join($or_,qqq('Bielorrusia','Belarús','"Rusia+Blanca"','bielorruso','bielorrusa','Belarus'))},
'pt-PT'=>{q=>join($or_,qqq('Bielo-Rússia','Bielorússia','Bielorrússia','bielorrusso','bielorrussa','Belarus'))},
#'ja_jp'=>{ned=>'ja_jp',hl=>'ja',q=>join($or_,qqq('ベラルーシ語','ベラルーシ'))},
'ja'=>{ned=>'us',hl=>'ja',q=>join($or_,qqq('ベラルーシ語','ベラルーシ'))},
'zh_cn'=>{scoring=>'',q=>join($or_,qqq('白俄羅斯'))},
'iw'=>{ned=>'iw_il',q=>join($or_,qqq('בלארוס'))},
'hi'=>{ned=>'hi_in',q=>join($or_,qqq('बेलारूसी','बेलारूस'))},
'ar'=>{ned=>'ar_me',q=>join($or_,qqq('"روسيا البيضاء"'))},
'ru'=>{ned=>'ru_ru',q=>join($or_,qqq(@or_ru))}
);
my $ya=join('%7C',qqq(@or_ru));
#push @lang,(
my %lang_web=('ru'=>1);
my @lang=('ru');
#'es_AR','au','nl_be','fr_be','en_bw','ca','fr_ca','cs_cz','es_cl','es_co','es_cu','es_us','en_et','en_gh','in','en_ie','en_il','en_ke','hu_hu','en_my','es_mx','en_na','nz','en_ng','no_no','de_at','en_pk','es_pe','en_ph','pl_pl','de_ch','fr_sn','en_sg','en_za','fr_ch','sv_se','en_tz','tr_tr','uk','us','en_ug','es_ve','vi_vn','en_zw','el_gr','ru_ua','uk_ua','iw_il','ar_ae','ar_sa','ar_me','ar_lb','ar_eg','hi_in','ta_in','te_in','ml_in','kr','cn','tw','hk'
for('en','es','fr','de','nl_nl','it','pt-PT_pt','pt-BR_br','ja','zh_cn','iw','hi','ar'){
push @lang,$_;
$lang_web{$_}=2;
}
my $langs=join(',','be',@lang,'en-us');
my $results=0;
sub mon{
for(0..$#month){
if(index($_[0],$month[$_])>=0){
$_+=$_[1];
$_+=12 while($_<0);
return $month[$_]
}
}
}
sub arc_exit{
chdir($pub)||die $!;
while(<$xmlname.*.xml>){
my $m=mon($_);
my $time=gmtime;
for my $mm($arc_month..0){goto NN if($m eq mon($time,$mm))}
my $x=$_;
my $y="!";
$x=~s/(\d{4})/$y=$1/ex;
my $f="*$m*$y*";
my $fa="$xmlname.$y.$m.$arc";
if(!-e $fa){
&{$arcs{$arc}}($fa,$f);
system("chmod 444 $pub/*");
exit
}
NN:
}
}
sub fn{
substr($_[1],-3) eq '.gz'?($_[0]?"|gzip -9 >$_[1]":"gzip -dc <$_[1]|"):($_[0]?">$_[1]":"<$_[1]");
}
sub qqq{
my @x=(@_);
for(@x){
$_=~s/([^a-zA-Z])/sprintf('%%%02X',ord($1))/eg;
$all{$_}=1
}
@x
}
sub add_cont{
my $l=$_[0].'/';
my $x;
$l=~s/http\:\/\/(.*?)[\:\/]/$x=$1;''/gsei;
$cont{$x}||=qm($_[1]);
}
sub cur_cont{
my ($i,$r,$c);
for(sort keys %cont){
$i=unesc($_);
$c=unqm($cont{$_});
$r.="
".url("http://$i",$c) if(index($_[0],$_)>=0 || index($_[0],$i)>=0 || $_ ne $c);
}
$r
}
sub esc{
my $x=shift;
$x=~s/([\x00-\x1f,:\"\'\\\/])/sprintf('%%%02X',ord($1))/eg;
$x;
}
sub unesc{
my $x=shift;
local $1;
$x=~s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$x;
}
sub qm{
#quotemeta($_[0])
my $x=shift;
$x=~s/([\'\\])/\\$1/gs;
$x=~s/\r/\\r/gs;
$x=~s/\n/\\n/gs;
$x;
}
sub unqm{
my $x=shift;
$x=~s/\\(.)/$1/g;
$x;
}
sub get_xml{
my ($gurl,$req,$opt,$wget)=(@_);
$wget=0;
my ($s,%h,@a,@a1,@ad,$x);
@a=split(/:\/\//,$gurl,2);
unshift @a,'http' if(!defined($a[1]));
@a[1,3]=split(/\//,$a[1],2);
@ad=@a[1,2]=split(/:/,$a[1],2);
@a1=@a;
$ad[1]||=80;
$a1[0]&&="$a[0]://";
$a1[2]&&=":$a[2]";
$a1[3]&&="/$a[3]";
if($proxy){
@ad=split(/:/,$proxy,2);
$a1[3]=join('',@a1);
}
print "+";
if ($wget) {
open SO, "wget --save-headers --timeout=$timeout -t1 ".($proxy?"-Y1 -e http_proxy=$proxy":"-Y0")." -O - ".quotemeta($gurl)." |" || goto ERR;
} else {
socket(SO,PF_INET,SOCK_STREAM,PROTO_TCP)&&
setsockopt(SO,SOL_SOCKET,SO_SNDTIMEO,pack('L!L!',$timeout,0))&&
setsockopt(SO,SOL_SOCKET,SO_RCVTIMEO,pack('L!L!',$timeout,0))&&
connect(SO,sockaddr_in($ad[1],inet_aton($ad[0])))&&goto OK;
ERR:
close(SO);
$fail++;
return;
OK:
select(SO);$|=1;select(STDOUT);
vec($vec_in='',fileno(SO),1)=1;
$x=qq($req $a1[3] HTTP/1.1
Host: $a1[1]
User-Agent: robot $url
Accept: $mimes
Accept-Language: $langs
Accept-Encoding: gzip
${opt}Connection: close
);
$x=~s/\n/\r\n/gs; # yandex lighthttp bug
print(SO $x)||goto ERR;
}
while(defined($x=)){$s.=$x;$x=~s/[\r\n]*//gs;$x||last}
for(split(/[\r]\n/,$s)) {
$_=~s/^(.*?): (.*?)$/$h{lc($1)}=$2;''/gise;
$s=$_ if($_);
}
if(index($s,' moved ')>=0){
((caller(0))[3] eq (caller(3))[3]) && goto ERR;
close(SO);
return get_xml($h{'location'},$_[1],$_[2])
}
$s='';
if($req ne 'HEAD') {
while(defined($x=)&&((!defined($h{'content-length'}))||length($s)<$h{'content-length'})){
if($h{'transfer-encoding'} eq 'chunked' && !$wget) {
$x=~s/[\n\r]//gs;
($x eq '')&&next;
my $n=hex($x)||last;
my $x1;
$x='';
while(defined($x1=)&&length($x.=$x1)<$n){}
defined($x1)||last;
$x=substr($x,0,$n);
}
$s.=$x;
}
if(lc($h{'content-encoding'}) eq 'gzip'){
$x="$temp/__gzip";
open my $F,"|gzip -dc >$x" || die "gzip decoding failed";
print $F $s;
close($F);
open $F,"<$x" || die "gzip decoding failed";
sysread($F,$s,-s $F);
close($F);
unlink $x;
}
print "\n!!! ERROR - no /rss>\n" if(index(lc($s),'/rss>')<0);
}
close(SO);
$s,%h
}
sub url{
my $u=$_[0];
my $t=$_[1]||unesc($u);
"$t"
}
my (@block,%item,%channel,@items,$cnt0,$cnt0_);
## 'id'=>[start,char,end,start1,char1.end1];
my %blocks=(
'rss.channel'=>[
sub{%channel=()},
undef,
undef,
undef,
sub{shift;$channel{$block[$#block]}=join('',@_)}
],
'rss.channel.item'=>[
sub{%item=()},
undef,
sub{push @items,{%item};undef %item},
undef,
sub{shift;$item{$block[$#block]}=join('',@_)},
undef
],
'rss.channel.item.link'=>[
]
);
#$blocks{'rdf:RDF.item'}=$blocks{'rss.channel.item'};
my %handlers=(
Start=>sub{parser_event(3,@_);push @block,$_[1];unshift @_,0;goto &parser_event},
Char=>sub{parser_event(1,@_);unshift @_,4;goto &parser_event},
End=>sub{parser_event(2,@_);while($_[1] ne pop @block){};unshift @_,5;goto &parser_event},
);
sub parser_event{
my $e=shift;
my $id=join('.',@block[0..$#block-($e>3)]);
#print "$id\n";
if(exists($blocks{$id})){
my $h=$blocks{$id};
goto ref($h)||return;
HASH:return $h->{('Start','Char','End')[$e]}(@_);
ARRAY:return defined(@$h[$e])?&{@$h[$e]}(@_):undef;
SCALAR:return;
}
}
sub _var{
my($s,$v,$r)=(@_);
$s=~s/[\s\t\r\n]$v\=['"]?([^\s\t\r\n'"]*)/$r=$1/es; #'"
$r;
}
sub get_rss{
my $url=shift;
my $head=shift;
my $retry=2;
RETRY:
my ($x,%h,$ff,$ffb,$t,$p,$lm);
print "get $url\n";
if(substr($url,0,7) eq 'file://'){
$ff=substr($url,7);
return if(!-e $ff);
goto FILE;
}
add_cont($url);
if($head){
($x,%h)=get_xml($url,'HEAD');
return if(!defined(%h));
}
if(exists($h{'last-modified'})){
$ffb="$temp/newz-".esc($url);
stat($ff="$ffb.".esc($h{'last-modified'}));
if(-e _){
FILE:
print "== $ff\n";
open(FF,fn(0,$ff)) or die "$! $ff";
$x='';
while(!eof(FF)&&(my $s=)){$x.=$s}
close(FF);
}else{
($x,%h)=get_xml($url,'GET',undef,$_[2]);
return if(!defined($x));
}
}else{($x,%h)=get_xml($url,'GET',undef,$_[2]);
}
$p=new XML::Parser::Lite;
$p->setHandlers(%handlers);
my ($e,$a);
$x=~s/(<\?xml.*?>)/$a.=$1;$1/es;
$e=_var($a,"encoding")||_var($h{"content-type"},"charset")||"utf-8";
$x=~s/\<\!\[CDATA\[(.*?)\]\]\>/htmlz($1);$1/gse;
from_to($x,$e,$encoding,HTMLCREF) if($e && $e ne $encoding);
undef %channel;
@items=();
if(index($x,'/rss>')<0){
return if(($mime_xml{'*/*'}&&$h{'content-type'})||$mime_xml{lc((split(/;/,$h{'content-type'}))[0])});
print "\nERROR: $h{'content-type'} $url\n";$retry--?goto RETRY:return
}
my $OLDDIE=$SIG{__DIE__};
$SIG{__DIE__}=sub{defined($ff) && unlink($ff); print "Parser error: $_[0]\n"};
eval('$p->parse($x)');
$SIG{__DIE__}=$OLDDIE;
## debug:
#$ff=substr(($ffb="$temp/newz-".esc($url)).".".esc($h{'last-modified'}||=gmtime),0,128);
if(exists($h{'last-modified'})){
while(my $d=<$ffb.*>){unlink($d)}
wrf($ff,$x) if($ff);
}
$x=$channel{title};
$x=~s/[\:\n].*//s;
#add_cont($channel{link},quotemeta($x)) if($channel{link});
add_cont($channel{link},$x) if($channel{link});
addnews(@_);
1
}
my %htm=(
'lt'=>'<',
'gt'=>'>',
'amp'=>'&',
'quot'=>'"'
);
sub dehtml{
my $s=shift;
$s=~s/\&(.*?)\;/$htm{$1}||"\&$1;"/gse;
$s
}
sub htmlz{
my $s=shift;
for(keys %htm){$s=~s/$htm{$_}/$_/gs}
}
my @news0:shared;
my %news:shared;
my %nh:shared;
my %mm=('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);
sub nkey{$_[0]->{xlink}||$_[0]->{link}||$_[0]->{description}}
# [param_redirect[,lang[,no_sort_by_time]]]
sub addnews{
my ($l,$x);
my $u=$_[0];
for(@items){
$l=$_->{link};
$_->{lang}||=ref($_[1]) eq 'CODE'?&{$_[1]}($_):$_[1];
if($u){
$x="$l\&";
$x=~s/[\&\?\;]$u\=(.*?)\&/$l=unesc(dehtml($1));''/gse;
if($x ne "$l\&"){
$l="http://$l" if(index($l,'://')==-1);
$_->{xlink}=$l;
}
};
$l=nkey($_);
if(exists($nh{$l})){$nh{$l}++}
else{
if($_[2]){
$nh{$l}=0;
push @news0,$_;
}else{
$nh{$l}=1;
my ($t1,$t)=(0,$_->{pubDate});
$t=~s/([0-9]{2})\:([0-9]{2})\:([0-9]{2})/$t1=$3+($2+$1*60)*60;''/e;
$t=~s/([0-9]{1,2}) ([a-zA-Z]{3}) ([0-9]{4})/$t1+=($1+$mm{$2}*31+$3*365)*24*60*60;''/e;
$t=~s/\+0([0-9])00/$t1-=$1*60*60;''/ex;
$t=~s/\-0([0-9])00/$t1+=$1*60*60;''/ex;
$t1++ while(exists($news{$t1}));
$news{$t1}=$_
}
add_cont($l)
}
}
}
sub mv{rename($_[0],$_[1])||`mv -f $_[0] $_[1]`}
my $time=gmtime;
sub time2h{
my $x="$_[0] GMT";
$x=~s/ /\ \;/gs;
$x
}
my ($t0,$time0,$tstamp,$counter,$rotate);
if(open(FT,fn(0,"$pub/time$enc"))){
$t0=;chomp($t0);
$time0=;chomp($time0);
$counter=;chomp($counter);
$tstamp=;chomp($tstamp);
close FT;
print "time: ",time-$t0,"\n";
};
$tstamp||=time;
sub cp{
my $x=quotemeta($_[0]);
my $y=quotemeta($_[1]);
`cp -f $x $y`
}
if($ARGV[0] eq 'test'){$web=$pub=$temp;}
get_rss("file://$pub/$xmlname.xml$enc",0,undef,'ru',1);$cnt0_=$cnt0=$#news0;
if($ARGV[0] ne 'test' && (time-$t0>$rotate_time || $ARGV[0] eq 'rotate')){
cp("$pub/$xmlname.xml$enc","$pub/$xmlname.$time.xml$enc");
cp("$pub/$htmlname.html$enc","$pub/$htmlname.$time.html$enc");
undef $t0;
$rotate=1;
$cnt0=-1;
}
if(!defined($t0)){($t0,$time0,$counter)=(time,$time,0)}
my $hhead=qq(