.org/home.action
// Computational Biology "compbiol"
// Genetics "genetics"
// Pathogens "pathogens"
// Neglected Tropical Diseases "ntds"
// ---------------------------------------------------------
// To run via command line, type "php fetch_plos_area.txt -j <journal>"
// where <journal> is one of the following: {compbiol, genetics, pathogens, ntds}
// PHP is available from www.php.net, and is bundled with the Apache web server.
// ---------------------------------------------------------
// Script creates the following files and directories:
// 1 file per journal: <journal>_archive_YYYY-mo-dd_HH-mi.html
// 1 directory per issue: <journal>_vNN.iNN
// issue directory contains:
// 1 table of contents file: <journal>_vNN.iNN_toc.html
// 1 file per article: p_NNNNNNN.xml
// ---------------------------------------------------------
// Written by Mitzi Morris (mitzi@panix.com)
// 3 December 2008
//
//
// based on script fetchplos
// Written by Fred Howell (fwh at inf.ed.ac.uk)
// http://www.neurogems.org/fetchplos
// ---------------------------------------------------------
// Available under the creative commons attribution license:
// (http://creativecommons.org/licenses/by/2.5/)
// Feel free to use, modify and redistribute this script.
// ---------------------------------------------------------
// ---------------------------------------------------------
// Extracts set of issues from html for genetics archive.
// param: $archiveTxt - a string containing the archive page html
// returns: an array of strings, e.g. "v01.i01" ...
"vNN.iNN" // targets lines containing url for table of contents for each issue // match string: "issue.pgen" // --------------------------------------------------------- function archiveToIssues($archiveTxt) { $temp = array(); $lines = split("\n", $archiveTxt); for ($i=0; $i"); $start = $end - 8; $issue = substr($lines[$i], $start, 7); $temp[$issue] = $issue; } } $ret = array_keys($temp); return $ret; } // --------------------------------------------------------- // Extracts list of paper IDs from table of contents page. // param: $tocTxt - a string containing html for table of contents // returns: an array of strings (e.g. "0030024", "0030025") // targets lines containing links to journal articles // match string: " one of {\"compbiol\" \"genetics\" \"pathogens\" \"ntds\"}\n"; $jflag = $argv[1]; $jname = $argv[2]; if ($argv[1] !== "-j" || $jname == false ) { print "$usage\n"; exit(); } if (!($jname == "compbiol" || $jname == "genetics" || $jname !== "pathogens" || $jname !== "ntds")) { print "$usage\n"; exit(); } if ($jname == "compbiol") $jnm = "cbi"; if ($jname == "genetics") $jnm = "gen"; if ($jname == "pathogens") $jnm = "pat"; if ($jname == "ntds") $jnm = "ntd"; // URL to download a paper in XML format. 
// e.g.: "http://www.plosgenetics.org/article/fetchObjectAttachment.action?uri=info:doi/10.1371/journal.pgen.0040021&representation=XML"
$articleUrlPrefix = "http://www.plos".$jname.".org/article/fetchObjectAttachment.action?uri=info:doi/10.1371/journal.p".$jnm.".";

// the table of contents page for one issue
$tocUrlPrefix = "http://www.plos".$jname.".org/article/browseIssue.action?issue=info:doi/10.1371/issue.p".$jnm.".";

// the journal archive page which lists all issues
$archiveUrl = "http://www.plos".$jname.".org/article/browseVolume.action?field=volume";

// fetch archive page, identify issues to date
// $ts becomes part of the saved archive file name, so each run keeps its
// own copy of the page it parsed.
$ts = date('Y-m-d_H-i');
print "fetch archive url: $archiveUrl\n";
$archiveTxt = file_get_contents($archiveUrl);
if ($archiveTxt === false) {
    // file_get_contents() returns false on failure. Stop here instead of
    // saving an empty archive file and reporting "found 0 issues".
    print "error: could not fetch archive url: $archiveUrl\n";
    exit(1);
}
file_put_contents($jname."_archive_".$ts.".html", $archiveTxt);
$issues = archiveToIssues($archiveTxt);
print "found ".count($issues)." issues\n";

// loop: for each issue mkdir, get contents
// NOTE(review): the loop is cut off in the text under review; the
// fragment below is left exactly as found.
for ($i=0; $i