Skip to content
Commits on Source (59)
#!/bin/sh

# Download PubMed release files and load them into a local archive.
#
# Arguments:
#   [-path] MASTER
#       Master area. When omitted, EDIRECT_PUBMED_MASTER is used.
#   [-temp | -work | -working] WORKING
#       Working area. When omitted, EDIRECT_PUBMED_WORKING is used, and if
#       that is unset too the master area doubles as the working area.
#
# The option words are only labels: each one is skipped and the following
# word is taken as the directory.

# Skip an optional "-path" label in front of the master directory.
while [ $# -gt 0 ]
do
  case "$1" in
    -path )
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

# Resolve the master area to an absolute path.
if [ "$#" -gt 0 ]
then
  target="$1"
  # Abort when the directory cannot be entered. Without this check MASTER
  # would be empty and the mkdir calls below would act on /Archive and
  # /Postings at the filesystem root.
  MASTER=$(cd "$target" && pwd) || {
    echo "$0: Unable to access master path '$target'" >&2
    exit 1
  }
  shift
else
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    echo "Must supply path to master archive area or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
  fi
fi

# Skip an optional label in front of the working directory.
while [ $# -gt 0 ]
do
  case "$1" in
    -temp | -work | -working )
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

# Resolve the working area, defaulting to the master area.
if [ "$#" -gt 0 ]
then
  working="$1"
  WORKING=$(cd "$working" && pwd) || {
    echo "$0: Unable to access working path '$working'" >&2
    exit 1
  }
  shift
else
  if [ -z "${EDIRECT_PUBMED_WORKING}" ]
  then
    WORKING=${MASTER}
  else
    WORKING="${EDIRECT_PUBMED_WORKING}"
    WORKING=${WORKING%/}
  fi
fi

# Create the directory layout used by the steps below.
for dir in Archive Postings
do
  mkdir -p "$MASTER/$dir"
done

for dir in Current Indexed Inverted Merged Pubmed
do
  mkdir -p "$WORKING/$dir"
done

# Prepare the archive only once; CACHEDIR.TAG marks an already prepared one.
if [ ! -f "$MASTER/Archive/CACHEDIR.TAG" ]
then
  pm-prepare "$MASTER/Archive"
fi

echo "Downloading PubMed Files"
# The following commands run from inside the download directory, so do not
# continue from an unrelated directory if it cannot be entered.
cd "$WORKING/Pubmed" || exit 1
download-pubmed baseline updatefiles
echo ""

echo "Populating PubMed Archive"
pm-stash "$MASTER/Archive"
echo ""

echo "Refreshing Versioned Records"
pm-refresh "$MASTER/Archive"
echo ""

# Closing check: pull one record back out of the archive and print a line
# built from it (presumably a smoke test of the freshly loaded archive).
echo 18810966 |
fetch-pubmed -path "$MASTER/Archive" |
xtract -pattern Author -if Affiliation -contains Medicine \
  -pfx "Archive is " -element Initials
echo ""
#!/usr/bin/perl -w
# Usage: asp-ls PATH
#!/usr/bin/env perl
use warnings;
use strict;
use Net::FTP;
# Usage: asp-ls PATH
my $server = "ftp.ncbi.nlm.nih.gov";
my $dir = shift;
my $ftp = new Net::FTP($server, Passive => 1)
......
#!/bin/sh

# Fetch the Go packages that xtract and rchive depend on, then build both
# programs inside $GOPATH/src/xtract.

# Every path below is derived from GOPATH, so refuse to run without it.
if [ -z "$GOPATH" ]
then
  echo "$0: GOPATH environment variable must be set" >&2
  exit 1
fi

cd "$GOPATH" || exit 1

# Update each dependency; a failure on one package does not stop the rest.
for pkg in \
  github.com/fatih/color \
  github.com/fiam/gounidecode/unidecode \
  github.com/klauspost/cpuid \
  github.com/pbnjay/memory \
  github.com/surgebase/porter2 \
  golang.org/x/text/runes \
  golang.org/x/text/transform \
  golang.org/x/text/unicode/norm
do
  go get -u "$pkg"
done

# Build in the source directory; never build from some other directory if
# it is missing.
cd "$GOPATH/src/xtract" || exit 1
go build -o xtract xtract.go common.go
go build -o rchive rchive.go common.go
This diff is collapsed.
......@@ -5,21 +5,16 @@ useasp=`has-asp`
filter() {
while read fl
do
base=${fl%.xml.gz}
if [ -f "$fl" ]
then
continue
fi
if [ -f "$base.snt" ]
then
continue
fi
echo "$fl"
done
}
download() {
if [ "$useasp" == "true" ]
if [ "$useasp" = "true" ]
then
asp-ls "pubmed/$1" |
grep -v ".md5" | grep "xml.gz" |
......@@ -35,8 +30,16 @@ download() {
if [ "$#" -eq 0 ]
then
echo "Must indicate either baseline or updatefiles"
exit 1
download "baseline"
if [ $? -ne 0 ]
then
download "baseline"
fi
download "updatefiles"
if [ $? -ne 0 ]
then
download "updatefiles"
fi
fi
while [ "$#" -gt 0 ]
......
......@@ -10,7 +10,7 @@ do
done
download() {
if [ "$useasp" == "true" ]
if [ "$useasp" = "true" ]
then
asp-ls ncbi-asn1 |
grep "aso.gz" | eval "$filt" |
......
#!/bin/sh

# Launch edirect.pl, located next to this script, in -blast mode and hand it
# every command-line argument unchanged.

script_dir=$(dirname "$0")
kernel=$(uname -s)

# Stripping the prefix changes the string only when the kernel name starts
# with CYGWIN_NT.
if [ "${kernel#CYGWIN_NT}" != "$kernel" ]
then
  # The perl one-liner exits 0 (shell success) exactly when perl reports an
  # MSWin platform, hence the negated match. Such a perl gets the path in
  # Windows form.
  if perl -e 'exit $^O !~ /^MSWin/'
  then
    script_dir=$(cygpath -w "$script_dir")
  fi
fi

exec perl "$script_dir"/edirect.pl -blast "$@"
This diff is collapsed.
#!/usr/bin/perl
#!/usr/bin/env perl
# ===========================================================================
#
......
#!/bin/sh

# Emit a PubmedArticleSet XML document by handing the archive directory to
# "rchive -fetch".
#
# Arguments:
#   -strict | -mixed   value passed to rchive as -flag (default "none")
#   [-path] DIR        archive location; "/Archive" is appended unless DIR
#                      already ends in it. When omitted,
#                      $EDIRECT_PUBMED_MASTER/Archive is used.
#
# NOTE(review): this block was rebuilt from a diff in which removed and added
# lines were interleaved; only the added side is kept. Confirm against the
# upstream script.

flag="none"

while [ $# -gt 0 ]
do
  case "$1" in
    -strict )
      flag="strict"
      shift
      ;;
    -mixed )
      flag="mixed"
      shift
      ;;
    -path )
      # Label only: the directory is the next word.
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

if [ "$#" -gt 0 ]
then
  argument="$1"
  target=$(cd "$argument" && pwd)
  target=${target%/}
  # Accept either the master area or the Archive directory itself.
  case "$target" in
    */Archive ) ;;
    * ) target=$target/Archive ;;
  esac
else
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    echo "Must supply path to archive files or set EDIRECT_PUBMED_MASTER environment variable"
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
    target="$MASTER/Archive"
  fi
fi

# Normalise the kernel name so Cygwin and MinGW both read CYGWIN_NT, then
# convert the path to Windows form when cygpath is available.
osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
if [ "$osname" = "CYGWIN_NT" -a -x /bin/cygpath ]
then
  target=`cygpath -w "$target"`
fi

target=${target%/}

# XML prolog first, then the records wrapped in a single PubmedArticleSet.
echo '<?xml version="1.0" encoding="UTF-8"?>'
echo '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st June 2018//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd">'
rchive -gzip -flag "$flag" -fetch "$target" \
  -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>"
#!/usr/bin/perl -w
# Usage: ftp-cp SERVER PATH FILE...
#!/usr/bin/env perl
use warnings;
use strict;
use Net::FTP;
# Usage: ftp-cp SERVER PATH FILE...
my $server = shift;
my $dir = shift;
my $ftp = new Net::FTP($server, Passive => 1)
......
#!/usr/bin/perl -w
# Usage: ftp-ls SERVER PATH
#!/usr/bin/env perl
use warnings;
use strict;
use Net::FTP;
# Usage: ftp-ls SERVER PATH
my $server = shift;
my $dir = shift;
my $ftp = new Net::FTP($server, Passive => 1)
......
#!/usr/bin/perl
#!/usr/bin/env perl
# ===========================================================================
#
......
#!/bin/sh

# Download PubMed, load it into the local archive, then build the index and
# postings files, reporting the wall-clock time of every stage.
#
# Arguments:
#   [-path] MASTER
#       Master area. When omitted, EDIRECT_PUBMED_MASTER is used.
#   [-temp | -work | -working] WORKING
#       Working area. When omitted, EDIRECT_PUBMED_WORKING is used, and if
#       that is unset too the master area doubles as the working area.
#
# The option words are only labels: each one is skipped and the following
# word is taken as the directory.

# run_step TITLE DIR COMMAND [ARG...]
#   Print TITLE, change into DIR (skipped when DIR is empty), run COMMAND,
#   print the elapsed seconds followed by a blank line, and leave the elapsed
#   time in the global variable "seconds". The directory change persists for
#   later steps. A failed change aborts the script so that COMMAND never runs
#   in the wrong directory.
run_step() {
  step_title="$1"
  step_dir="$2"
  shift 2
  seconds_start=$(date "+%s")
  echo "$step_title"
  if [ -n "$step_dir" ]
  then
    cd "$step_dir" || exit 1
  fi
  "$@"
  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  echo "$seconds seconds"
  echo ""
}

# Skip an optional "-path" label in front of the master directory.
while [ $# -gt 0 ]
do
  case "$1" in
    -path )
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

# Resolve the master area to an absolute path.
if [ "$#" -gt 0 ]
then
  target="$1"
  # Abort when the directory cannot be entered. Without this check MASTER
  # would be empty and the mkdir calls below would act on /Archive and
  # /Postings at the filesystem root.
  MASTER=$(cd "$target" && pwd) || {
    echo "$0: Unable to access master path '$target'" >&2
    exit 1
  }
  shift
else
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    echo "Must supply path to master archive area or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
  fi
fi

# Skip an optional label in front of the working directory.
while [ $# -gt 0 ]
do
  case "$1" in
    -temp | -work | -working )
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

# Resolve the working area, defaulting to the master area.
if [ "$#" -gt 0 ]
then
  working="$1"
  WORKING=$(cd "$working" && pwd) || {
    echo "$0: Unable to access working path '$working'" >&2
    exit 1
  }
  shift
else
  if [ -z "${EDIRECT_PUBMED_WORKING}" ]
  then
    WORKING=${MASTER}
  else
    WORKING="${EDIRECT_PUBMED_WORKING}"
    WORKING=${WORKING%/}
  fi
fi

# Create the directory layout used by the steps below.
for dir in Archive Postings
do
  mkdir -p "$MASTER/$dir"
done

for dir in Current Indexed Inverted Merged Pubmed
do
  mkdir -p "$WORKING/$dir"
done

# Prepare the archive only once; CACHEDIR.TAG marks an already prepared one.
if [ ! -f "$MASTER/Archive/CACHEDIR.TAG" ]
then
  pm-prepare "$MASTER/Archive"
fi

# Steps with an empty directory argument keep running in the directory
# entered by the preceding step.
run_step "Downloading PubMed Files" "$WORKING/Pubmed" download-pubmed baseline updatefiles
DWN=$seconds

run_step "Populating PubMed Archive" "" pm-stash "$MASTER/Archive"
POP=$seconds

run_step "Refreshing Versioned Records" "" pm-refresh "$MASTER/Archive"
REF=$seconds

run_step "Collecting PubMed Records" "" pm-current "$WORKING/Current" "$MASTER/Archive"
COL=$seconds

run_step "Indexing PubMed Records" "$WORKING/Current" pm-index "$WORKING/Indexed"
IDX=$seconds

run_step "Inverting PubMed Indices" "$WORKING/Indexed" pm-invert "$WORKING/Inverted"
INV=$seconds

run_step "Merging Inverted Indices" "$WORKING/Inverted" pm-merge "$WORKING/Merged"
MRG=$seconds

run_step "Producing Postings Files" "$WORKING/Merged" pm-promote "$MASTER/Postings"
PST=$seconds

# Timing summary.
echo "DWN $DWN seconds"
echo "POP $POP seconds"
echo "REF $REF seconds"
echo "COL $COL seconds"
echo "IDX $IDX seconds"
echo "INV $INV seconds"
echo "MRG $MRG seconds"
echo "PST $PST seconds"
echo ""

# Closing check: run one query through the postings and the archive and
# print a line built from the result (presumably an end-to-end smoke test).
phrase-search -path "$MASTER/Postings" -query "mapping of spatio-temporal pollution status AND 2008 [YEAR]" |
fetch-pubmed -path "$MASTER/Archive" |
xtract -pattern Author -if Affiliation -contains Medicine \
  -pfx "Archive and Index are " -element Initials
echo ""
#!/bin/bash -norc

# Usage: intersect-uid-lists FILE1 FILE2
#
# Print the lines that occur in both FILE1 and FILE2, in ascending numeric
# order. Needs bash for the <( ) process substitutions.

# Without both operands sort would be asked to read a file named "" and the
# script would still exit 0, so report the mistake explicitly.
if [ "$#" -lt 2 ]
then
  echo "Usage: intersect-uid-lists FILE1 FILE2" >&2
  exit 1
fi

# comm needs sorted input; -12 suppresses the lines unique to either file.
comm -12 <(sort -f "$1") <(sort -f "$2") | sort -n
#!/bin/sh

# Run a query against the local Postings directory through rchive.
#
# Options:
#   -path | -master DIR   master area or Postings directory; when omitted,
#                         $EDIRECT_PUBMED_MASTER/Postings is used
#   -count -counts -countr -countp -query -phrase -search -exact
#   -mock -mocks -mockx   select the rchive operation (default: -query;
#                         -phrase is an alias for -query)
#   -debug                print the resolved mode, path and arguments and exit
#   -h | -help | --help   print usage and exit
#
# All remaining words are joined with spaces and passed to rchive as a single
# query argument.

target=""
mode="query"
debug=false

while [ $# -gt 0 ]
do
  case "$1" in
    -h | -help | --help )
      mode=help
      break
      ;;
    -debug )
      debug=true
      shift
      ;;
    -path | -master )
      # The option needs a value; shifting twice with only one word left is
      # an error in some shells and a silent no-op in others.
      if [ $# -lt 2 ]
      then
        echo "$0: Missing path after $1" >&2
        exit 1
      fi
      target=$2
      shift 2
      ;;
    -count )
      mode="count"
      shift
      ;;
    -counts )
      mode="counts"
      shift
      ;;
    -countr )
      mode="countr"
      shift
      ;;
    -countp )
      mode="countp"
      shift
      ;;
    -query | -phrase )
      mode="query"
      shift
      ;;
    -search )
      mode="search"
      shift
      ;;
    -exact )
      mode="exact"
      shift
      ;;
    -mock )
      mode="mock"
      shift
      ;;
    -mocks )
      mode="mocks"
      shift
      ;;
    -mockx )
      mode="mockx"
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

if [ "$mode" = "help" ]
then
  cat <<EOF
USAGE: $0
[-path path_to_pubmed_master]
-count | -counts | -search | -exact | [-query]
query arguments
EXAMPLE: local-phrase-search -query catabolite repress* AND protease inhibit*
EOF
  exit
fi

if [ -z "$target" ]
then
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    echo "Must supply path to postings files or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
    target="$MASTER/Postings"
  fi
else
  argument="$target"
  # A directory that cannot be entered would otherwise resolve to the empty
  # string and turn the target into /Postings.
  target=$(cd "$argument" && pwd) || {
    echo "$0: Unable to access path '$argument'" >&2
    exit 1
  }
  target=${target%/}
  # Accept either the master area or the Postings directory itself.
  case "$target" in
    */Postings ) ;;
    * ) target=$target/Postings ;;
  esac
fi

# Normalise the kernel name so Cygwin and MinGW both read CYGWIN_NT, then
# convert the path to Windows form when cygpath is available.
osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
if [ "$osname" = "CYGWIN_NT" -a -x /bin/cygpath ]
then
  target=`cygpath -w "$target"`
fi

target=${target%/}

if [ "$debug" = true ]
then
  echo "mode: $mode, path: '$target', args: '$*'"
  exit
fi

# Every mode name set by the option loop is also the name of the rchive flag
# it selects, so one call covers all of them.
rchive -path "$target" "-$mode" "$*"
This diff is collapsed.
#!/bin/sh

# Run a query against the local Postings directory through rchive.
#
# Options:
#   -path | -master DIR   master area or Postings directory; when omitted,
#                         $EDIRECT_PUBMED_MASTER/Postings is used
#   -count -counts -countr -countp -query -phrase -search -exact
#                         select the rchive operation (default: -query;
#                         -phrase is an alias for -query)
#   -debug                print the resolved mode, path and arguments and exit
#   -h | -help | --help   print usage and exit
#
# All remaining words are joined with spaces and passed to rchive as a single
# query argument.

target=""
mode="query"
debug=false

while [ $# -gt 0 ]
do
  case "$1" in
    -h | -help | --help )
      mode=help
      break
      ;;
    -debug )
      debug=true
      shift
      ;;
    -path | -master )
      # The option needs a value; shifting twice with only one word left is
      # an error in some shells and a silent no-op in others.
      if [ $# -lt 2 ]
      then
        echo "$0: Missing path after $1" >&2
        exit 1
      fi
      target=$2
      shift 2
      ;;
    -count )
      mode="count"
      shift
      ;;
    -counts )
      mode="counts"
      shift
      ;;
    -countr )
      mode="countr"
      shift
      ;;
    -countp )
      mode="countp"
      shift
      ;;
    -query | -phrase )
      mode="query"
      shift
      ;;
    -search )
      mode="search"
      shift
      ;;
    -exact )
      mode="exact"
      shift
      ;;
    -* )
      exec >&2
      echo "$0: Unrecognized option $1"
      exit 1
      ;;
    * )
      break
      ;;
  esac
done

if [ "$mode" = "help" ]
then
  cat <<EOF
USAGE: $0
[-path path_to_pubmed_master]
-count | -counts | -search | -exact | [-query]
query arguments
EXAMPLE: phrase-search -query catabolite repress* AND protease inhibit*
EOF
  exit
fi

if [ -z "$target" ]
then
  if [ -z "${EDIRECT_PUBMED_MASTER}" ]
  then
    echo "Must supply path to postings files or set EDIRECT_PUBMED_MASTER environment variable" >&2
    exit 1
  else
    MASTER="${EDIRECT_PUBMED_MASTER}"
    MASTER=${MASTER%/}
    target="$MASTER/Postings"
  fi
else
  argument="$target"
  # A directory that cannot be entered would otherwise resolve to the empty
  # string and turn the target into /Postings.
  target=$(cd "$argument" && pwd) || {
    echo "$0: Unable to access path '$argument'" >&2
    exit 1
  }
  target=${target%/}
  # Accept either the master area or the Postings directory itself.
  case "$target" in
    */Postings ) ;;
    * ) target=$target/Postings ;;
  esac
fi

# Normalise the kernel name so Cygwin and MinGW both read CYGWIN_NT, then
# convert the path to Windows form when cygpath is available.
osname=`uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/'`
if [ "$osname" = "CYGWIN_NT" -a -x /bin/cygpath ]
then
  target=`cygpath -w "$target"`
fi

target=${target%/}

if [ "$debug" = true ]
then
  echo "mode: $mode, path: '$target', args: '$*'"
  exit
fi

# Every mode name set by the option loop is also the name of the rchive flag
# it selects, so one call covers all of them.
rchive -path "$target" "-$mode" "$*"
#!/bin/sh

# Pass every *.xml.gz file in the current directory through
# "xtract -mixed -format flush" and store the re-compressed result under the
# same name in the directory given as the first argument. Files that already
# exist in that directory are left alone, so an interrupted run can be
# resumed.

if [ "$#" -eq 0 ]
then
  echo "Must supply path for cleaned files" >&2
  exit 1
fi

target="$1"
target=${target%/}

for fl in *.xml.gz
do
  # When nothing matches, the shell leaves the pattern unexpanded. Processing
  # it would create a bogus file literally named "*.xml.gz" in the target.
  if [ ! -f "$fl" ]
  then
    continue
  fi
  base=${fl%.xml.gz}
  if [ -f "$target/$base.xml.gz" ]
  then
    continue
  fi
  echo "$base"
  gunzip -c "$fl" |
  xtract -mixed -format flush |
  gzip > "$target/$base.xml.gz"
done
#!/bin/sh

# Regenerate the numbered pubmedNNN.xml.gz files in the directory given as the
# first argument. Each file is produced by piping one consecutive range of
# integers into "stream-pubmed -path ARCHIVE", where ARCHIVE is the second
# argument. The range width is EDIRECT_CHUNK_SIZE, or 250000 when unset.

# Both directories are mandatory; the first is consumed before the second is
# checked.
if [ "$#" -eq 0 ]
then
  echo "Must supply path for current files"
  exit 1
fi
target=${1%/}
shift

if [ "$#" -eq 0 ]
then
  echo "Must supply path for archive files"
  exit 1
fi
archive=${1%/}
shift

# Start from a clean slate.
find "$target" -name "*.xml.gz" -delete

chunk_size="${EDIRECT_CHUNK_SIZE:-250000}"
loop_max=$((50000000 / chunk_size))

# Bounds of the range that the next iteration will stream.
next_fr=0
next_to=$((chunk_size - 1))

seq 1 "$loop_max" | while read n
do
  base=$(printf 'pubmed%03d' "$n")

  # Take this iteration's bounds and advance the counters right away, so the
  # skip path and the normal path stay in step.
  fr=$next_fr
  to=$next_to
  next_fr=$((next_fr + chunk_size))
  next_to=$((next_to + chunk_size))

  [ -f "$target/$base.xml.gz" ] && continue

  echo "$base XML"
  started=$(date "+%s")
  seq -f "%0.f" "$fr" "$to" | stream-pubmed -path "$archive" > "$target/$base.xml.gz"
  finished=$(date "+%s")
  echo "$((finished - started)) seconds"

  # A result of at most 300 bytes is discarded and ends the run (presumably
  # it holds no records, i.e. the end of the archive was reached).
  if [ "$(wc -c <"$target/$base.xml.gz")" -le 300 ]
  then
    rm "$target/$base.xml.gz"
    exit 0
  fi
  sleep 1
done