Skip to content

Commit 6da64d5

Browse files
committed
out-link counts without redirect pages
1 parent 6d3e02f commit 6da64d5

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

scripts/src/main/bash/oneliners.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,4 +123,4 @@ ls -1 */*/*-page-links*.{nt,ttl}.gz | grep -v -E 'en-uris|unredirected' > page-l
123123

124124
# generate out-link count
125125
# (for counting alone the first sort is not really necessary - our triples are already grouped by subject, and grouping is enough for uniq - but join needs its inputs sorted on the join field, and it's nicer that the in-link and out-link counts are sorted similarly)
126-
# superseded version: counts triples per subject for every file listed in page-links.txt, without filtering out redirect pages
( export LC_ALL=C ; cat page-links.txt | while read i ; do echo $i; gzip -d < $i | grep -v '^#' | cut -d ' ' -f 1 | sort | uniq -c | sort -k 1,1 -n -s -r | awk '{print $2" <http://dbpedia.org/ontology/wikiPageOutLinkCount> \""$1"\"^^<http://www.w3.org/2001/XMLSchema#integer> ." }' | gzip -c > ${i/page-links/page-out-link-counts} ; done &> page-out-link-counts.log & )
126+
# For each file listed in page-links.txt: count triples per subject, drop (join -v 1) every subject that also occurs as a subject in the matching redirects file, sort by count (descending, stable) and write wikiPageOutLinkCount triples to the matching page-out-link-counts file.
# Note: join requires both inputs sorted on the join field in the same collation, hence the two plain sorts under LC_ALL=C.
# Paths are quoted and read with -r so names containing spaces, glob characters or backslashes are handled literally.
( export LC_ALL=C ; while read -r i ; do echo "$i" ; join -v 1 <(gzip -d < "$i" | grep -v '^#' | cut -d ' ' -f 1 | sort | uniq -c | awk '{print $2" "$1}') <(gzip -d < "${i/page-links/redirects}" | grep -v '^#' | cut -d ' ' -f 1 | sort) | sort -k 2,2 -n -s -r | awk '{print $1" <http://dbpedia.org/ontology/wikiPageOutLinkCount> \""$2"\"^^<http://www.w3.org/2001/XMLSchema#integer> ." }' | gzip -c > "${i/page-links/page-out-link-counts}" ; done < page-links.txt &> page-out-link-counts.log & )

0 commit comments

Comments
 (0)