1 # This module is part of Refence, and is released under the terms of
2 # the GPL version 2, or any later version. See the file README and
3 # COPYING for more information.
4 # Copyright 2003 by Don Armstrong <don@donarmstrong.com>.
5 # $Id: PubMed.pm 45 2013-09-10 18:05:31Z don $
7 package Reference::Retrieve::PubMed;
11 Reference::Retrieve::PubMed -- Reference Retrieval from PubMed
15 my $reference = Reference::Retrieve::PubMed::get_reference(-pmid=>123456);
16 my @references = Reference::Retrieve::PubMed::get_reference(-query=>'John Smith[AUTHOR] AND 230[Pages]',limit=>50);
21 Uh. Retreives references from pubmed. Yeah.
31 use vars qw($REVISION $DEBUG);
35 use XML::Simple qw(:strict);
40 use Params::Validate qw(:types validate_with);
43 ($REVISION) = q$LastChangedRevision: 45 $ =~ /\$LastChangedRevision:\s+([^\s+])/;
44 $DEBUG = 0 unless defined $DEBUG;
52 my $reference = Reference::Retrieve::PubMed::get_reference(-pmid=>123456);
53 my @references = Reference::Retrieve::PubMed::get_reference(-query=>'John Smith[AUTHOR] AND 230[Pages]',-limit=>50);
54 my @references = Reference::Retrieve::PubMed::get_reference(-query=>{author=>'John Smith', pages=>'230'},-limit=>50)
58 Retrives a reference from pubmed
62 In scalar context, effectively assumes -limit=>1 and returns the
63 highest listed reference according to the order, etc. [Probably only
64 usefull with -pmid.] In list context, returns all results (or until it
69 list of arguments to select a reference or collection of references from.
75 my %options = validate_with(params => @_,
76 spec => {pubmed_site => {default => 'http://www.ncbi.nlm.nih.gov'},
77 pmid_query => {default => '/entrez/query.fcgi?cmd=Text&db=PubMed&dopt=XML&uid='},
78 search_query => {default => '/htbin-post/Entrez/query?db=m&form=4&dispmax=100&html=no&dopt=u&term='},
79 ua_agent => {default => "DA Reference::Retreive::PubMed/$REVISION"},
80 email => {default => "don+referenceretrieve$REVISION\@donarmstrong.com"},
84 my $ua = new LWP::UserAgent(agent => $options{ua_agent});
87 sub get_reference_by_pmid($;@){
88 my %options = validate_with(params => \@_,
89 spec => {pmid => {type => SCALAR|ARRAYREF,
92 pubmed_site => {default => 'http://www.ncbi.nlm.nih.gov'},
93 pmid_query => {default => '/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&rettype=full&id='},
94 search_query => {default => '/htbin-post/Entrez/query?db=m&form=4&dispmax=100&html=no&dopt=u&term='},
95 ua_agent => {default => "DA Reference::Retreive::PubMed/$REVISION"},
96 email => {default => "don+referenceretrieve$REVISION\@donarmstrong.com"},
97 useragent => {optional => 1},
101 my $pmid = $options{pmid};
104 if ($options{useragent}) {
105 $ua = $options{useragent};
108 $ua = new LWP::UserAgent(agent=>$options{ua_agent});
110 my $url = "$options{pubmed_site}$options{pmid_query}" . (ref($pmid) ? (join('&id=',@{$pmid})) : $pmid);
111 print STDERR "url: $url" if $DEBUG;
112 my $request = HTTP::Request->new('GET', $url);
113 my $response = $ua->request($request);
114 $response = $response->content;
115 print STDERR "response: $response" if $DEBUG;
117 # For some dumb reason, they send us xml with html
118 # entities. Ditch them.
119 #$response = decode_entities($response);
120 # It's even more freaking broken; they don't double encode them.
121 #$response =~ s/\>(\s|$)/>$1/gso;
122 #$response =~ s/(?:(\s)\<|<(\/))/$1<$2/gso;
123 $response =~ s/"/"/gso;
126 $response =~ s/^\s*<\?xml[^>]+>\s*//gso;
127 $response =~ s/^\s*<\!DOCTYPE[^>]+>\s*//gso;
128 # There is also a Pubmedarticleset
129 $response =~ s/^\s*<PubmedArticleSet>\s*//gso;
130 $response =~ s#</PubmedArticleSet>\s*$##gso;
132 # Add the opt so we get an array of PubMedArticle
133 $response = "<opt>$response</opt>";
135 print STDERR $response if $DEBUG;
137 # Figure out if there was an error in the search.
139 # Response should be in XML. Parse it.
140 my $xa = new XML::Simple;
142 my $ref_struct = $xa->XMLin($response, ForceArray => [ 'PubmedArticle' ],KeyAttr=>[]);
145 print STDERR Dumper($ref_struct) if $DEBUG;
146 # Handle the XML structure
148 foreach my $ref (@{$ref_struct->{PubmedArticle}}) {
149 my $reference = _create_reference_from_xml($ref,$ua);
150 if (not defined $reference) {
151 warn "Unable to create reference for $ref->{MedlineCitation}->{PMID}\n";
153 push @references, $reference;
158 return $references[0];
161 sub _create_reference_from_xml($$){
164 # Figure out what type of reference this is. We only support
165 # Journal Articles right now.
166 my $types = {'journal article'=>'article',
167 'letter' =>'article',
168 'editorial' => 'article',
169 'review' => 'article',
171 my $ref_type = undef;
172 my $reference = undef;
173 foreach my $type (keys %{$types}) {
174 if (ref($ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType}) eq 'ARRAY'){
176 @{$pubtypes}{map {lc} @{$ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType}}} =
177 (1) x @{$ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType}};
178 if ($pubtypes->{$type}) {
179 $ref_type = $types->{$type};
186 elsif (lc($ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType}) eq lc($type)) {
187 $ref_type = $types->{$type};
191 if (not defined $ref_type) {
192 warn "Unsupported PublicationType: ".(ref($ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType})?
193 join(',',@{$ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType}}):
194 $ref->{MedlineCitation}->{Article}->{PublicationTypeList}->{PublicationType});
195 print STDERR Dumper($ref) if $DEBUG;
196 $ref_type = 'article';
198 local $_ = $ref_type;
200 use Reference::Type::Article;
201 $reference = new Reference::Type::Article;
202 my $xml_mapping = {author => [ _fix_medline_authors($ref->{MedlineCitation}->{Article}->{AuthorList}) ],
203 title => [_fix_medline_title($ref->{MedlineCitation}->{Article}->{ArticleTitle})],
204 abstract => [_fix_medline_abstract($ref->{MedlineCitation}->{Article}->{Abstract}->{AbstractText})],
205 journal => [_fix_medline_journal($ref->{MedlineCitation}->{Article}->{Journal},
206 $ref->{MedlineCitation}->{Article}->{MedlineJournalInfo},
211 # pmid => $ref->{MedlineCitation}->{PMID},
212 # medline_id => $ref->{MedlineCitation}->{MedlineID},
213 volume => [_fix_medline_ditch_empty($ref->{MedlineCitation}->{Article}->{Journal}->{JournalIssue}->{Volume})],
214 date => [_fix_medline_pubdate($ref->{MedlineCitation}->{Article}->{Journal}->{JournalIssue}->{PubDate})],
215 number => [_fix_medline_ditch_empty($ref->{MedlineCitation}->{Article}->{Journal}->{JournalIssue}->{Issue})],
216 pages => [_fix_medline_pages($ref->{MedlineCitation}->{Article}->{Pagination}->{MedlinePgn})],
217 # keywords => [_fix_medline_keywords($ref->{MedlineCitation}->{MeshHeadingList},
218 # $ref->{MedlineCitation}->{ChemicalList},
220 # &_find_pubmed_links($ref->{MedlineCitation}->{PMID},$ua),
224 foreach my $reference_key (keys %{$xml_mapping}) {
225 my $method = $reference->can($reference_key);
226 die "Reference::Type::Article was unable to handle $reference_key" if not $method;
227 if (defined $xml_mapping->{$reference_key} and $method) {
228 if (ref($xml_mapping->{$reference_key})) {
229 &{$method}($reference,@{$xml_mapping->{$reference_key}});
232 &{$method}($reference,$xml_mapping->{$reference_key});
236 warn "Reference_key $reference_key was not defined or unable to handle type of key."
237 if not defined $xml_mapping->{$reference_key} and $DEBUG;
244 sub _fix_medline_title($){
251 sub _fix_medline_abstract{
252 my $abstract = shift;
254 if (ref($abstract) and ref($abstract) eq 'ARRAY') {
255 for my $element (@{$abstract}) {
256 $ret .= "\n" if length $ret;
257 $ret .= $element->{Label}.': '.$element->{content};
266 sub _fix_medline_authors($){
267 my $author_list = shift;
268 $author_list = $author_list->{Author};
270 $author_list = [$author_list] if ref($author_list) ne 'ARRAY';
271 foreach my $author (@{$author_list}) {
273 $au{first} = $author->{ForeName} if exists $author->{ForeName};
274 $au{last} = $author->{LastName} if exists $author->{LastName};
275 $au{initials} = $author->{Initials} if exists $author->{Initials};
279 return (author=>\@authors);
282 =head2 _fix_medline_journal
286 $reference->journal(_fix_medline_journal($ref->{MedlineCitation}->{Article}->{Journal},
287 $ref->{MedlineCitation}->{Article}->{MedlineJournalInfo},
292 From the medline citation informatino returns a properly formatted
293 list of information for the journal reference listing.
297 Journal information hashref
299 medline journal information hashref
305 sub _fix_medline_journal($$$;){
306 my ($journal,$medline_journal,$ua) = @_;
307 # journal takes fullname, issn, medlineabbr, pmid, and nlmuid
308 # Try to supply as much as possible.
309 # Use esearch to get pmjournalid
310 # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=journals&term=0021-9258
311 # use esummary to retreive the journalid
312 # <?xml version="1.0"?>
313 # <!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD eSearchResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSearch_020511.dtd">
317 # <RetStart>0</RetStart>
326 # <Term>0021-9258[All Fields]</Term>
327 # <Field>All Fields</Field>
330 # <Explode>Y</Explode>
332 # </TranslationStack>
335 my $ISSN = $journal->{ISSN};
337 $ISSN = $ISSN->{content};
339 my $url = qq(http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=journals&term=$ISSN);
340 print STDERR "url: $url" if $DEBUG;
341 my $request = HTTP::Request->new('GET', $url);
342 my $response = $ua->request($request);
343 $response = $response->content;
344 print STDERR "response: $response" if $DEBUG;
346 my ($journal_id) = $response =~ m#<Id>\s*(\d+)\s*</Id>#i;
348 # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=journals&id=4559
349 # <?xml version="1.0"?>
350 # <!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD eSummaryResult, 11 May 2002//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eSummary_020511.dtd">
354 # <Item Name="Title" Type="String">The Journal of biological chemistry.</Item>
355 # <Item Name="MedAbbr" Type="String">J Biol Chem</Item>
356 # <Item Name="IsoAbbr" Type="String">J. Biol. Chem.</Item>
357 # <Item Name="NlmId" Type="String">2985121R</Item>
359 # <Item Name="pISSN" Type="String">0021-9258</Item>
360 # <Item Name="eISSN" Type="String">1083-351X</Item>
361 # <Item Name="PublicationStartYear" Type="String">1905</Item>
362 # <Item Name="PublicationEndYear" Type="String"></Item>
363 # <Item Name="Publisher" Type="String">American Society for Biochemistry and Molecular Biology</Item>
364 # <Item Name="Language" Type="String">eng</Item>
366 # <Item Name="Country" Type="String">United States</Item>
370 $url = qq(http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=journals&id=$journal_id);
371 print STDERR "url: $url" if $DEBUG;
372 $request = HTTP::Request->new('GET', $url);
373 $response = $ua->request($request);
374 $response = $response->content;
375 print STDERR "response: $response" if $DEBUG;
378 while ($response =~ m{^\s*(?:(?:<id>\s*(\d+)</id>)| # Match ids
379 (?:<item\s+name=\"([^\"]+)\"\s+Type=\"String\">\s*([^<]+?)</item>))\s* # Match item Name clauses
381 if (not defined $2) {
385 $journal{lc($2)} = $3;
388 my %journal_mapping = (title => q(title),
389 medlineabbr => q(medabbr),
390 isoabbr => q(isoabbr),
394 publisher => q(publisher),
398 foreach my $key (keys %journal_mapping) {
399 push @journal_entry,($key=>$journal{$journal_mapping{$key}});
401 return @journal_entry;
408 $reference->date(_fix_medline_pubdate($ref->{MedlineCitation}->{Article}->{Journal}->{JournalIssue}->{PubDate}));
418 sub _fix_medline_pubdate($){
420 return (year=>$date->{Year},month=>$date->{Month},day=>$date->{Day}) if exists $date->{Year};
422 if (exists $date->{MedlineDate}) {
423 my ($year,$month,$day) = split /\s+/,$date->{MedlineDate};
424 return (year=>$year,month=>$month,day=>$day)
428 =head2 _fix_medline_pages
432 pages => [_fix_medline_pages($ref->{MedlineCitation}->{Article}->{Pagination}->{MedlinePgn})],
436 Returns output with a list of pages appropriate for an Article type of
441 sub _fix_medline_pages($){
442 my ($pagination) = @_;
443 my ($start,$stop) = $pagination =~ /(\d*)\s*\-\s*(\d*)/;
444 if (not defined $start) {
445 ($start) = $pagination =~ /(\d+)/
447 if ($start > $stop and defined $stop) {
448 # this must be a reduced page listing; fix it up
449 $stop+=$start - $start % 10 ** (int(log($stop)/log(10))+1);
452 push @return, (start=>$start) if defined $start and $start ne '';
453 push @return, (stop=>$stop) if defined $stop and $stop ne '';
457 sub _find_pubmed_links($$){
460 #http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
461 my $url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&cmd=llinks&id=" . (ref($pmid) ? (join('&id=',@{$pmid})) : $pmid);
462 print STDERR "url: $url" if $DEBUG;
463 my $request = HTTP::Request->new('GET', $url);
464 my $response = $ua->request($request);
465 $response = $response->content;
466 print STDERR "response: $response" if $DEBUG;
468 # Response should be in XML. Parse it.
469 my $xa = new XML::Simple;
471 my $ref_struct = $xa->XMLin($response, ForceArray => ['IdUrlSet'], KeyAttr=>[]);
474 print STDERR Dumper($ref_struct);# if $DEBUG;
475 # Rearange data around Id.
477 map {$links->{$_->{Id}}=$_->{ObjUrl}} @{$ref_struct->{LinkSet}->{IdUrlList}->{IdUrlSet}};
478 foreach my $obj_url (@{$links->{$pmid}->{ObjUrl}}) {
479 next unless $obj_url->{SubjectType} = 'publishers/providers';
480 #@links = _find_links_from_url($obj_url->{Url},$ua);
482 # Find publisher link
483 # If no publisher link, use the first aggregator link.
497 my %ids_known = (medline => 'medline_id',
502 if (exists $ref->{PubmedData}{ArticleIdList}{ArticleId}) {
503 for my $art_id (ref($ref->{PubmedData}{ArticleIdList}{ArticleId}) eq 'ARRAY' ?
504 @{$ref->{PubmedData}{ArticleIdList}{ArticleId}}:
505 ($ref->{PubmedData}{ArticleIdList}{ArticleId})) {
506 if (exists $ids_known{$art_id->{IdType}}) {
507 $ids{$ids_known{$art_id->{IdType}}} = $art_id->{content};
511 if (not exists $ids{pmid}) {
512 $ids{pmid} = $ref->{MedlineCitation}->{PMID} if defined $ref->{MedlineCitation}->{PMID};
514 if (not exists $ids{medline_id}) {
515 $ids{medline_id} = $ref->{MedlineCitation}->{MedlineID} if defined $ref->{MedlineCitation}->{MedlineID};
521 =head2 _find_links_from_url
533 sub _find_links_from_url($$){
540 sub _fix_medline_ditch_empty($){
544 if (ref($value) eq 'HASH') {
545 if (scalar keys %{$value} > 0) {
552 elsif (ref($value) eq 'ARRAY') {
553 if (scalar @{$value} > 0) {
565 return $value if defined $value;