Wikipedia:WikiProject Wikidemia/Quant/Code/parsexml


#!/bin/tcsh
# parsexml -- reduce a MediaWiki pages-meta-history XML dump to the header
# fields of each page and revision, written as comma-separated text.
#
#   Input   : $WDIR/$WFILE
#   Output  : $WDIR/headers.raw
#   Scratch : $WDIR/headers.raw1, $WDIR/headers.raw2 (gone when the script ends)
#
# The start and end times are printed with date(1).
setenv WDIR ~tobacman/bulk/data/wiki/dumps
setenv WFILE idwiki-20060506-pages-meta-history.xml

# Name the scratch and output files once instead of repeating the paths.
set raw1 = "$WDIR/headers.raw1"
set raw2 = "$WDIR/headers.raw2"
set out  = "$WDIR/headers.raw"

date

# Clear leftovers from any earlier run.
rm -f "$raw1" "$raw2" "$out"

# Stop here if the dump is missing; otherwise every later step would run on
# empty input and leave an empty, misleading headers.raw behind.
if ( ! -r "$WDIR/$WFILE" ) then
    echo "parsexml: cannot read $WDIR/$WFILE"
    exit 1
endif

# Step 1: keep only the lines that carry one of the header tags and delete
# everything else (the final "-e d").
# The minor-edit tag is self-closing, so the pattern has to contain an escaped
# slash: "<minor *\/>" matches both "<minor />" and "<minor/>".  The earlier
# spelling "<minor \>" never matched, because "\>" is an end-of-word anchor in
# GNU sed, not a slash.
sed -e '/<timestamp>/b' -e '/<ip>/b' -e '/<username>/b' -e '/<title>/b' \
    -e '/<comment>/b' -e '/<id>/b' -e '/<minor *\/>/b' -e '/<revision>/b' \
    -e d "$WDIR/$WFILE" > "$raw1"

# Step 2: per-line clean-up, done in a single pass over the file:
#   - strip leading and trailing blanks from every line;
#   - remove commas from <comment> and <title> lines, so that a comma in the
#     output is always a field separator;
#   - on <timestamp> lines turn the "T" between date and time into a comma
#     and drop the first "Z".
sed -e 's/^[ \t]*//;s/[ \t]*$//' \
    -e '/<comment>/s/,//g' \
    -e '/<title>/s/,//g' \
    -e '/<timestamp>/s/T/,/g;/<timestamp>/s/Z//' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"

# Step 3: fold the field lines into the line that precedes them.  Each pass
# handles one tag: a line starting with that tag is appended to the previous
# line, and the newline plus opening tag is replaced by a separator.
#   <id>, <timestamp>, <comment>  ->  ","
#   <ip>                          ->  ",ip,"
#   <username>                    ->  ",name,"
#   minor tag                     ->  ",-1,"
# <title> and <revision> lines are never folded, so each of them starts a new
# output line.
# "\!" is written instead of "!" because csh/tcsh would otherwise treat "!"
# as a history reference, even inside single quotes.
sed -e :a -e '$\!N;s/\n<id>/,/;ta' -e 'P;D' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"
sed -e :a -e '$\!N;s/\n<timestamp>/,/;ta' -e 'P;D' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"
sed -e :a -e '$\!N;s/\n<ip>/,ip,/;ta' -e 'P;D' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"
sed -e :a -e '$\!N;s/\n<username>/,name,/;ta' -e 'P;D' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"
sed -e :a -e '$\!N;s/\n<comment>/,/;ta' -e 'P;D' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"
# Put in ,-1, when it's a minor edit.
sed -e :a -e '$\!N;s/\n<minor *\/>/,-1,/;ta' -e 'P;D' "$raw1" > "$raw2"
mv -f "$raw2" "$raw1"

# Step 4: remove the remaining tags (opening and closing); a tag that is
# split across lines is handled by pulling in the next line and retrying.
sed -e :a -e 's/<[^>]*>//g;/</N;//ba' "$raw1" > "$out"
rm -f "$raw1"


date


exit