# A script to parse the entries from Medline via PubMed and convert to BibTex
#
# To run: gawk -f parsepubmed.awk
#           initials=ABC authortype=type search_file > file.bib
# where
#    type = lastnamefirst for names of the form MacLeod, R.S.
#           otherwise, we use R.S. MacLeod
#
# Diagnostics are written to /dev/stderr so that the script also works
# when no controlling terminal is available.
#
## Last update: Fri Oct  9 20:38:40 2009 by Rob Macleod
##    - add controls for initials
## Last update: Tue Jun 12 08:54:37 2007 by Rob Macleod
##    - changed the reg exp for each keyword.  Pubmed
##      seems to have switched from blanks to a tab character
## Last update: Wed Jul  6 17:29:27 2005 by Rob Macleod
##    - updated just slightly for new distribution on the web
# LAST update: Thu Apr  4 11:26:35 2002 by Rob MacLeod
#    - Tim Holy (holy@pcg.wustl.edu) added a fix to the page number
#      conversion
# Last update: Thu Nov 15 15:13:35 2001 by Rob MacLeod
#    - fixed it again, this time it better work! (-:
# Last update: Tue Oct 30 15:39:54 2001 by Rob MacLeod
#    - fixed up authors to be more robust
# Last update: Mon Aug 16 13:07:17 1999 by Rob MacLeod
#    - updated from original medline version
# Last update: Sun Nov 22 01:42:39 1998 by Rob MacLeod
#    - unified the printing of entries and also added the feature to
#      parse a NOTE: entry
#
######################################################################
BEGIN {
    numrefs = 0;
    numauthors = 0;

    # Default initials for the citation key.  Assignments given as
    # command-line operands (initials=ABC) are applied only after BEGIN
    # has run, so the value actually in effect is reported by the
    # NR == 1 rule below rather than here.
    if ( length(initials) < 1 ) {
        initials = "RSM";
    }

    # List of journals we recognize and their short strings
    # Note, to add more, just enter the string as it appears in the pubmed
    # string in the list below together with its abbreviation,
    # and then UPDATE THE VALUE of njournals.
    njournals = 6;
    journals[1] = "IEEE Trans Biomed Eng";
    journals[2] = "Circ Res";
    journals[3] = "Circulation";
    journals[4] = "Am J Physiol";
    journals[5] = "Ann Biomed Eng";
    journals[6] = "J Electrocardiol";
    journalstr[1] = "j-BME";
    journalstr[2] = "j-CR";
    journalstr[3] = "j-C";
    journalstr[4] = "j-AJP";
    journalstr[5] = "j-ABE";
    journalstr[6] = "j-JE";

    # Detect gawk.  index() takes (haystack, needle); the program name may
    # be a full path, and PROCINFO["version"] exists only in gawk.
    qgawk = 0;
    if ( index(ARGV[0], "gawk") > 0 || ("version" in PROCINFO) ) {
        qgawk = 1;
    }

    # Write out the strings for journal names.  BibTeX needs an @String
    # macro to be defined before the first entry that refers to it, so
    # they are emitted ahead of all entries.
    for ( jnum = 1; jnum <= njournals; jnum++ ) {
        printf("@String{%s = \"%s\"}\n", journalstr[jnum], journals[jnum]);
    }
}

# Report the initials once command-line assignments have taken effect.
NR == 1 {
    Log( sprintf("Initials are %s\n", initials) );
}

#######################################################
# Read all the fields from the medline entry
# The order matters because of the way we have to read the
# records: Readentry() reads ahead and leaves the following field line
# in $0, where only the rules that come later in this file can still
# see it.  Try to match the order of the fields in the pubmed format.
# Each pattern accepts either a tab or blanks between tag and dash.
#######################################################
/^VI[ \t]+-/ {
    volumein = Readentry( "  - " );
}

/^IP[ \t]+-/ {
    issuein = Readentry( "  - " );
}

/^DP[ \t]+-/ {
    yearin = Readentry( "  - " );
    theyear = substr(yearin, 1, 4);
    # Two-digit year in the key before 2000, four digits afterwards.
    if ( theyear + 0 < 2000 ) {
        shortyear = substr(yearin, 3, 2);
    } else {
        shortyear = theyear;
    }
    numparts = split( yearin, yearparts, " " );
    year = yearparts[1];
    if ( numparts > 1 ) {
        month = yearparts[2];
    } else {
        month = "";
    }
}

/^TI[ \t]+-/ {
    titlein = Readentry( "  - " );
}

/^PG[ \t]+-/ {
    pagesin = Readentry( "  - " );
}

/^AB[ \t]+-/ {
    # Note: the argument in this next readentry is empirical and may cause
    # problems if the AD field is empty.
    notein = Readentry( "AD  -" );
}

/^AU[ \t]+-/ {
    if ( numauthors < 1 ) {
        authorin = Readline( "\t-" );
        # Find the three-character code to use
        # in the key for this reference
        numparts = split( authorin, authorparts, " " );
        firstauthor = "";
        if ( length(authorparts[1]) > 2 ) {
            firstauthor = substr(authorparts[1], 1, 3);
        } else if ( length(authorparts[1]) == 2 ) {
            firstauthor = authorparts[1] substr(authorparts[2], 1, 1);
        } else if ( length(authorparts[1]) == 1 ) {
            firstauthor = authorparts[1] substr(authorparts[2], 1, 2);
        }
        numauthors = 1;
    } else {
        # Additional authors each have their own line.
        authorin = authorin " ; " Readline( "\t-" );
        numauthors++;
    }
}

/^PT[ \t]+-/ {
    # atype is informational only: every record is written as @Article.
    dtype = Readentry( "  - " );
    if ( index(toupper(dtype), "JOURNAL") ) {
        atype = "Article";
    } else if ( index(toupper(dtype), "PROCEEDING") ) {
        atype = "InProceedings";
    } else if ( index(toupper(dtype), "REVIEW") ) {
        atype = "Article";
    } else {
        Log( sprintf(" With a paper type (PT) value of %s we do not know" \
                     " what to do??\n", dtype) );
        if ( length(atype) < 2 ) {
            atype = "ReadUnknown";
        } else {
            Log( sprintf(" So we leave the type set to %s\n", atype) );
        }
    }
}

/^TA[ \t]+-/ {
    journalin = Readentry( "  - " );
}

#######################################################
# Now process and convert what we have read
#######################################################
/^SO[ \t]+-/ {
    Log( " Found end of a record so generate output\n" );
    numrefs++;

    # %s rather than %3s for the author code: padding would put blanks
    # inside the citation key.
    printf("\n@Article{%s:%s%2.2d,\n", initials, firstauthor, shortyear);

    # Split out the authors names and then add the title
    PrintEntry( FormatAuthors( authorin ) );
    PrintEntry( "title " titlein );

    # Now the source.  The journal is printed without quotes because it
    # may be an @String macro name; CleanJournal adds quotes otherwise.
    PrintEntryNQ( "journal " CleanJournal( journalin ) );
    PrintEntry( "year " year );
    if ( length( month ) > 0 ) {
        PrintEntry( "month " month );
    }
    if ( length( volumein ) > 0 ) {
        PrintEntry( "volume " volumein );
    }
    if ( length( issuein ) > 0 ) {
        PrintEntry( "number " issuein );
    }
    PrintEntry( "pages " ExpandPages( pagesin ) );

    # The note (abstract) field; written empty when there is none.
    PrintEntry( "robnote " notein );

    # Add a bibdate entry
    if ( qgawk ) {
        PrintEntry( "bibdate " strftime("%c") );
    } else {
        PrintEntry( "bibdate " "unknown" );
    }

    # Close the entry
    printf("}\n");

    # Report to the user
    Log( sprintf("Done with: @Article{%s:%s%2.2d}\n",
                 initials, firstauthor, shortyear) );

    # Reset every per-record value so that a field missing from the next
    # record cannot inherit this record's value.
    volumein = "";
    issuein = "";
    yearin = "";
    year = "";
    month = "";
    shortyear = "";
    titlein = "";
    pagesin = "";
    journalin = "";
    notein = "";
    authorin = "";
    firstauthor = "";
    atype = "";
    numauthors = 0;
}

######################################################################
function Log( msg )
{
    # Write a diagnostic message (already formatted, including its
    # newline) to standard error, keeping standard output clean for the
    # BibTeX text.
    printf("%s", msg) > "/dev/stderr";
}

######################################################################
function Spaces( n,    s )
{
    # Return a string of n blanks (empty when n <= 0).
    s = "";
    while ( n-- > 0 ) {
        s = s " ";
    }
    return s;
}

######################################################################
function FieldText( line, separator,    pos, text )
{
    # Return the content of a field line with its tag removed.
    #   line      - the complete input line
    #   separator - text that ends the tag (e.g. "\t-"); when it does not
    #               occur in the line, a leading "TAG -" is removed instead
    # Leading blanks and tabs of the content are dropped.
    pos = ( separator != "" ) ? index(line, separator) : 0;
    if ( pos > 0 ) {
        text = substr(line, pos + length(separator));
    } else {
        text = line;
        sub(/^[A-Z]+[ \t]*-/, "", text);
    }
    sub(/^[ \t]+/, "", text);
    return text;
}

######################################################################
function Readline( separator )
{
    # Take the content of the current line only: separate the preamble
    # from the content by means of the separator and return the content
    # cleaned up by Packentry().  Does not read ahead.
    return Packentry( FieldText($0, separator) );
}

######################################################################
function Readentry( nextkey,    entryin )
{
    # Read the entry that starts on the current line and continues on the
    # following lines, and return it cleaned up by Packentry().
    #   nextkey - text that marks the start of the following field
    # Reading stops, with the stopping line left in $0, at the first line
    # that contains nextkey, is (nearly) empty, has "-" as the fifth
    # character of its first word (four-letter tags such as PMID-), or
    # has "-" as its second word (two and three-letter tags).
    entryin = FieldText($0, "\t-");
    while ( 1 ) {
        if ( (getline) <= 0 ) {
            # End of input (or a read error): without this check the
            # loop would spin forever on the unchanged $0.  Clear $0 so
            # that no later rule matches the stale line.
            $0 = "";
            break;
        }
        if ( length($0) <= 1 )                            break;
        if ( nextkey != "" && index($0, nextkey) > 0 )    break;
        if ( index($1, "-") == 5 )                        break;
        if ( $2 == "-" )                                  break;
        entryin = entryin " " $0;
    }
    return Packentry( entryin );
}

######################################################################
function Packentry( entryin )
{
    # Clean up and pack an entry: replace | by a blank, collapse runs of
    # blanks and tabs into one blank, remove double quotes (they would
    # end the BibTeX string early) and escape the LaTeX special
    # characters %, $ and &.
    gsub(/\|/, " ", entryin);
    gsub(/[ \t]+/, " ", entryin);
    gsub(/\"/, "", entryin);
    gsub(/%/, "\\%", entryin);
    gsub(/\$/, "\\$", entryin);
    # "\\\\&" is the replacement text \\& : a literal backslash followed
    # by the matched character, i.e. \&
    gsub(/&/, "\\\\&", entryin);
    return entryin;
}

######################################################################
function FormatAuthors( authorin,    authors, nauth, i, one, authorstring )
{
    # Format the author string for BibTeX.
    #   authorin - the authors in PubMed form separated by ";"
    # Returns "author " followed by the formatted names joined by " and ",
    # ready to be handed to PrintEntry().
    nauth = split( authorin, authors, ";" );
    authorstring = "author";
    for ( i = 1; i <= nauth; i++ ) {
        one = FixAuthor( authors[i] );
        if ( one == "" ) {
            continue;
        }
        authorstring = authorstring ( authorstring == "author" ? " " : " and " ) one;
    }
    return authorstring;
}

######################################################################
function DotInitials( inits,    k, out )
{
    # Turn a block of capital initials such as "RS" into "R.S.".
    # Anything that is not purely capital letters is returned unchanged.
    if ( inits !~ /^[A-Z]+$/ ) {
        return inits;
    }
    out = "";
    for ( k = 1; k <= length(inits); k++ ) {
        out = out substr(inits, k, 1) ".";
    }
    return out;
}

######################################################################
function FixAuthor( author,    parts, nparts, j, lastname, given )
{
    # Do the actual fixing and formatting of a single author name.
    # NOTE(review): the original body of this function was lost from the
    # file; this version follows the surviving comments and the usage
    # notes at the top of the file -- confirm against a known-good copy.
    #   author - one name in PubMed form, e.g. "MacLeod RS"
    # Returns "MacLeod, R.S." when the global authortype is
    # "lastnamefirst", otherwise "R.S. MacLeod".
    # NOTE(review): name suffixes such as "Jr" are not treated specially.
    nparts = split( author, parts, " " );
    if ( nparts == 0 ) {
        return "";
    }
    if ( nparts == 1 ) {
        return parts[1];
    }

    # Assume all but the last part are names that belong together, in the
    # same order they arrived in; the last part holds the initials.
    lastname = parts[1];
    for ( j = 2; j < nparts; j++ ) {
        lastname = lastname " " parts[j];
    }
    given = DotInitials( parts[nparts] );

    if ( authortype == "lastnamefirst" ) {
        return lastname ", " given;
    }
    return given " " lastname;
}

######################################################################
function ExpandPages( pages,    pagenums, lendiff )
{
    # Convert a PubMed page range into BibTeX form.
    # Add in explicit digits to the second page number for the case of,
    # for example, 123-45, which we want to be 123--145 (fix by Tim Holy:
    # copy the extra leading digits of the first page to the final page).
    # A single page is returned unchanged.
    if ( split( pages, pagenums, "-" ) > 1 ) {
        lendiff = length(pagenums[1]) - length(pagenums[2]);
        if ( lendiff > 0 ) {
            pages = pagenums[1] "--" substr(pagenums[1], 1, lendiff) pagenums[2];
        } else {
            gsub(/-+/, "--", pages);
        }
    }
    return pages;
}

######################################################################
function CleanJournal( journal,    jlen, cnum, rphit, newjournal, i )
{
    # Clean up a journal name for the journal field.
    # NOTE(review): the opening lines of the original function were lost
    # from the file; only the steps still visible are reproduced here.
    #   journal - the journal abbreviation as read from the TA field
    # Returns the @String macro name when the journal is one of the
    # entries of journals[], otherwise the name in double quotes with a
    # trailing "(...)" part removed.
    jlen = length(journal);

    # Look backwards for the last right parenthesis ...
    rphit = 0;
    for ( cnum = jlen; cnum >= 1; cnum-- ) {
        if ( substr(journal, cnum, 1) == ")" ) {
            rphit = cnum;
            break;
        }
    }

    # ... and for the left parenthesis before it; everything from there
    # on is dropped.  Position 1 is not considered so that the whole name
    # is never removed.  Without a match the name is used as it is.
    newjournal = journal;
    if ( rphit ) {
        for ( cnum = rphit; cnum > 1; cnum-- ) {
            if ( substr(journal, cnum, 1) == "(" ) {
                newjournal = substr(journal, 1, cnum - 1);
                break;
            }
        }
    }
    sub(/[ \t]+$/, "", newjournal);
    newjournal = "\"" newjournal "\"";

    # See if we recognize the journal name and have a string for it
    # already, e.g, IEEE Trans Biomed Eng = j-BME
    for ( i = 1; i <= njournals; i++ ) {
        if ( index( newjournal, journals[i] ) > 0 ) {
            newjournal = journalstr[i];
        }
    }
    return newjournal;
}

######################################################################
function WrapEntry( inentry, quoted,    words, nwords, w, line, indent, \
                    empty, maxlen, offset )
{
    # Print one BibTeX field with word wrapping.
    #   inentry - the field name followed by the words of its value
    #   quoted  - true to put the value between double quotes
    # The value starts in a fixed column, lines are kept to at most maxlen
    # characters where possible, and continuation lines are indented to
    # line up with the start of the value.  Every line takes at least one
    # word, so an over-long word cannot stall the loop, and an empty value
    # still gives a well-formed field.
    maxlen = 75;
    offset = 17;

    nwords = split( inentry, words );
    if ( nwords < 1 ) {
        return;
    }

    # The field name and = sign, padded out to the value column.
    line = "  " words[1] " = ";
    line = line Spaces( offset - length(line) );
    if ( quoted ) {
        line = line "\"";
    }
    indent = Spaces( offset + ( quoted ? 1 : 0 ) );

    # Skip an "=" that the caller supplied as a separate word.
    w = 2;
    if ( nwords >= 2 && words[2] == "=" ) {
        w = 3;
    }

    empty = 1;          # true while the current line holds no value word
    for ( ; w <= nwords; w++ ) {
        if ( !empty && length(line) + 1 + length(words[w]) > maxlen ) {
            printf("%s\n", line);
            line = indent;
            empty = 1;
        }
        line = line ( empty ? "" : " " ) words[w];
        empty = 0;
    }
    printf("%s%s,\n", line, ( quoted ? "\"" : "" ));
}

######################################################################
function PrintEntry( inentry )
{
    # Print a bibtex entry with decent word wrapping and formatting, the
    # value enclosed in double quotes.
    #   inentry - the field name followed by the value, e.g. "year 2009"
    WrapEntry( inentry, 1 );
    return;
}

######################################################################
function PrintEntryNQ( inentry )
{
    # No-quotation marks version of PrintEntry, for values that are
    # @String macro names or carry their own quotes.
    WrapEntry( inentry, 0 );
    return;
}