#!/usr/bin/perl

# PREP (Perl RegExps for Pubmed) is a script that allows the use of
# Perl regexes in the searching of Pubmed records, providing the ability to search
# records for textual patterns as well as keywords.

# Copyright 2005- Christopher M. Frenz
# This script is free software; it may be used, copied, redistributed, and/or modified
# under the terms laid forth in the Perl Artistic License.

# Please cite this script in any publication in which literature cited within the
# publication was located using the PREP.pl script.

# Usage: perl PREPv1-0.pl PubmedQueryTerms

# Usage of this script requires that the LWP and XML::LibXML modules be installed
use strict;
use warnings;

use LWP;
use XML::LibXML; #Version 1.58 used for development and testing

# Change the variable below to set the text pattern that Perl
# will seek to match in the returned results.
# The default matches one of the 20 one-letter amino-acid codes, one or more
# digits, and another such letter (e.g. "A123T").
my $regex='[ARNDCEQGHILKMFPSTWYV]\d+[ARNDCEQGHILKMFPSTWYV]';

# Compiled once; the outer group makes the matched text available in $1
# even if $regex itself contains capture groups.
my $pattern = qr/($regex)/;

# Name of the HTML report written by this script.
my $outfile = "PREPout.html";

# Percent-encodes every character except the RFC 3986 unreserved set so that
# a value can be embedded safely in a URL query string.
sub url_encode {
    my ($text) = @_;
    # Character strings are converted to UTF-8 bytes first; byte strings
    # (the normal case for @ARGV) are encoded as they are.
    utf8::encode($text) if utf8::is_utf8($text);
    $text =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord($1))/ge;
    return $text;
}

# Escapes the characters that are special in HTML text and attribute values.
sub html_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Returns the text of every descendant element named $tag under $node,
# joined with single spaces (empty string if there is none).
sub text_of {
    my ($node, $tag) = @_;
    return join(" ", map { $_->textContent } $node->getElementsByTagName($tag));
}

# Concatenates arguments passed to script to form Pubmed query
die "Usage: perl $0 PubmedQueryTerms\n" unless @ARGV;
my $query = join(" ", @ARGV);

# Creates the URL to search Pubmed.
# NCBI serves the E-utilities over HTTPS, so LWP needs HTTPS support
# (LWP::Protocol::https) to be installed.
my $eutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
my $url = $eutils . "esearch.fcgi?db=Pubmed&retmax=1&usehistory=y&term="
        . url_encode($query);

# Searches Pubmed and Returns the number of results
# as well as the session information needed for results retrieval
my $ua = LWP::UserAgent->new();
my $response = $ua->get($url);
die "Pubmed search failed: " . $response->status_line . "\n"
    unless $response->is_success;
my $results = $response->content;
print "PubMed Search Results \n";

# Each value is captured by its own list assignment so that a failed match
# yields undef instead of a stale $1 left over from an earlier match.
my ($NumAbstracts) = $results =~ /<Count>(\d+)<\/Count>/
    or die "Could not find a result count in the Pubmed response\n";
my ($QueryKey) = $results =~ /<QueryKey>(\d+)<\/QueryKey>/;
my ($WebEnv)   = $results =~ /<WebEnv>(.*?)<\/WebEnv>/;

# The history session is only needed when there is something to fetch.
if ($NumAbstracts > 0 and not (defined $QueryKey and defined $WebEnv)) {
    die "Pubmed response did not include QueryKey/WebEnv session data\n";
}

print "$NumAbstracts are Available \n";
print "Query Key= $QueryKey \n" if defined $QueryKey;
print "WebEnv= $WebEnv \n" if defined $WebEnv;

# Opens a file for output.
# XML::LibXML hands back decoded character strings, so the handle gets an
# explicit UTF-8 layer to write non-ASCII text correctly.
open(my $out, '>:encoding(UTF-8)', $outfile)
    or die "Cannot open $outfile for writing: $!\n";

my $parser = XML::LibXML->new;

my $retmax = 500; #Number of records to be retrieved per request-Max 500

# Base of the link written for each matching record; the PMID is appended.
my $url2 = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=";

my $Count = 0;

# Retrieves results in XML format, $retmax records per request.
# $retstart is the record number to start retrieval from; the strict "<"
# avoids an extra, empty request when the count is 0 or a multiple of $retmax.
for (my $retstart = 0; $retstart < $NumAbstracts; $retstart += $retmax) {
    print "Processing record # $retstart \n";
    my $fetch_url = $eutils
        . "efetch.fcgi?rettype=abstract&retmode=xml&retstart=$retstart&retmax=$retmax"
        . "&db=Pubmed&query_key=$QueryKey&WebEnv=" . url_encode($WebEnv);

    my $fetched = $ua->get($fetch_url);
    die "Pubmed fetch failed at record $retstart: " . $fetched->status_line . "\n"
        unless $fetched->is_success;

    # Uses a DOM based XML parser to process returned results.
    # The raw bytes are passed so the parser can honour the XML declaration.
    my $domtree = $parser->parse_string($fetched->content);

    for my $record ($domtree->getElementsByTagName("PubmedArticle")) {
        # Extracts element data for regex processing and output formatting
        my $title    = text_of($record, "ArticleTitle");
        my $journal  = text_of($record, "MedlineTA");
        my $volume   = text_of($record, "Volume");
        my $pgnum    = text_of($record, "MedlinePgn");
        my $abstract = text_of($record, "AbstractText");

        # Only the first PMID element is used for the link; a record may hold
        # further PMID elements, and concatenating them would break the URL.
        # NOTE(review): assumes the article's own PMID comes first in
        # document order - confirm against the efetch XML.
        my ($pmid_node) = $record->getElementsByTagName("PMID");
        my $pmid = $pmid_node ? $pmid_node->textContent : "";

        # Processes title and abstract for pattern match and if a match occurs
        # data is written to output
        if ($title =~ $pattern or $abstract =~ $pattern) {
            my $hit = $1;    # text matched by whichever test succeeded
            print {$out} "<h1>Pattern Match: ", html_escape($hit), " </h1>\n";
            print {$out} "<h3><a href=\"", html_escape($url2 . $pmid), "\">",
                         html_escape($title), " </a></h3> \n";
            print {$out} "<p>", html_escape("$journal $volume, $pgnum"), " </p>\n";
            print {$out} "<p>", html_escape($abstract), " </p>\n\n";
            $Count++;
        }
    }
}

# Buffered write errors only surface at close, so it is checked.
close($out) or die "Error closing $outfile: $!\n";
print "$Count records matched the pattern";