一、準備
1. 安裝 pycharm(或 visual studio code)
2. 安裝pip、biopython、openpyxl(本文章使用2.4.9版)等套件
3. 到 NCBI網站 註冊一個帳號
二、建置環境
1. 使用 pip 安裝 biopython 套件
sudo pip install biopython
2. 使用 pip 安裝 openpyxl 套件
sudo pip install openpyxl==2.4.9
三、開始來寫程式吧
1. 打開你的編輯器,先匯入會用到的模組
# -*- coding: utf-8 -*-
import re

from Bio import Entrez
from openpyxl import Workbook
2. 再來寫兩個函式:一個用來搜尋 PMID,一個用來下載 PMID 對應的資料
# Default e-mail address assigned to Entrez.email before each request.
# The value here is a placeholder; replace it with your own address.
pubmedEmail = "xxx@example.com"
# Search PubMed for the PMIDs matching a query
def search(query):
    """Run an Entrez ESearch against PubMed and return the parsed response.

    Args:
        query: Search expression, passed to ESearch as ``term``.

    Returns:
        The object produced by ``Entrez.read`` for the ESearch response.
    """
    Entrez.email = pubmedEmail
    handle = Entrez.esearch(
        db='pubmed',  # database to search
        sort='relevance',
        # Maximum number of results requested.
        # NOTE(review): NCBI documents a lower per-request cap for PubMed
        # ESearch; confirm whether larger result sets need paging.
        retmax='1000000',
        retmode='xml',  # response format
        term=query,  # search expression
        api_key="xxxx"  # API key issued by NCBI (placeholder)
    )
    try:
        return Entrez.read(handle)
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()
# Download the PubMed records for a list of PMIDs
def fetch_details(id_list):
    """Fetch the PubMed records for ``id_list`` through Entrez EFetch.

    Args:
        id_list: Sequence of PMID strings.

    Returns:
        The object produced by ``Entrez.read`` for the EFetch response. For an
        empty ``id_list`` no request is sent and a dict with an empty
        ``'PubmedArticle'`` list is returned instead.
    """
    # Nothing to fetch: avoid sending a request with an empty ``id`` value.
    if not id_list:
        return {'PubmedArticle': []}

    # EFetch takes the PMIDs as a single comma-separated string.
    ids = ','.join(id_list)
    Entrez.email = pubmedEmail
    handle = Entrez.efetch(
        db='pubmed',  # database to fetch from
        retmode='xml',  # response format
        id=ids,
        api_key="xxx"  # API key issued by NCBI (placeholder)
    )
    try:
        return Entrez.read(handle)
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()
3. 然後就用上面寫好的函式來抓取資料吧
# Matches an e-mail-like token: word characters, dots or hyphens around "@".
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+')


def _format_pub_date(pub_date):
    """Join the Year, Month and Day entries present in ``pub_date`` with spaces."""
    return " ".join(pub_date[key] for key in ("Year", "Month", "Day") if key in pub_date)


def _author_name(author):
    """Return "ForeName LastName" for ``author``, or "" when neither key is present."""
    return " ".join(author[key] for key in ("ForeName", "LastName") if key in author)


def _extract_email(affiliation):
    """Return the first e-mail address in ``affiliation`` (lower-cased), or "" if none."""
    match = _EMAIL_PATTERN.search(affiliation)
    if match is None:
        return ""
    # Affiliation sentences usually end with ".", which the pattern also captures.
    address = match.group().rstrip(".")
    # Require a dot in the domain part so tokens such as "a@b" are rejected.
    if "." not in address.partition("@")[2]:
        return ""
    return address.lower()


if __name__ == '__main__':
    # Create the Excel workbook and its two sheets
    workbook = Workbook()
    worksheet_publication = workbook.active
    worksheet_publication.title = "Publication"
    worksheet_publication.append(["Publication ID", "Author", "Publication date", "Journal", "Title", "Abstract", "Author info", "Research area", "Keyword"])
    worksheet_publication_owner = workbook.create_sheet(title="Publication Owner")
    worksheet_publication_owner.append(["Publication ID", "Email", "Name", "Country"])

    # Query expression
    query = '(hmgb1) AND ("2014/01/01"[Date - Publication] : "3000"[Date - Publication])'
    # Search for matching PMIDs
    results = search(query)
    id_list = results['IdList']
    # Download the records for those PMIDs (skip the request when nothing matched)
    papers = fetch_details(id_list) if id_list else {'PubmedArticle': []}

    # PMIDs that already have a row in the "Publication" sheet
    publication_exists = set()

    for paper in papers['PubmedArticle']:
        citation = paper['MedlineCitation']
        article = citation['Article']
        # Only papers that carry an author list are processed
        if "AuthorList" not in article:
            continue

        # Values that are the same for every author of this paper
        pmid = citation['PMID']
        pmid_key = str(pmid)
        journal = article['Journal']
        pub_date = _format_pub_date(journal['JournalIssue']['PubDate'])
        abstract = ""
        if "Abstract" in article:
            # Structured abstracts have several sections; keep all of them
            abstract = " ".join(article['Abstract']['AbstractText'])
        # The second header column is "Author": fill it with every named author
        authors = ", ".join(
            full_name
            for full_name in (_author_name(row) for row in article['AuthorList'])
            if full_name
        )

        for author_row in article['AuthorList']:
            name = _author_name(author_row)
            if not name:
                continue
            for affiliation_data in author_row.get('AffiliationInfo', []):
                affiliation = affiliation_data['Affiliation']
                mail_addr = _extract_email(affiliation)
                if not mail_addr:
                    continue
                try:
                    if pmid_key not in publication_exists:
                        # Write the publication row once per PMID
                        worksheet_publication.append([
                            pmid,
                            authors,
                            pub_date,
                            journal.get('Title', ''),
                            article['ArticleTitle'],
                            abstract,
                        ])
                        publication_exists.add(pmid_key)
                    # Write one owner row per e-mail address found
                    worksheet_publication_owner.append([
                        pmid,
                        mail_addr,
                        name,
                        affiliation,
                    ])
                except Exception as err:
                    # Best effort: a row the spreadsheet library rejects is
                    # reported and skipped instead of aborting the whole run.
                    print("Skipped a row for PMID {}: {}".format(pmid_key, err))
4. 最後就把它存成excel就完工囉
# Write the workbook to disk at the given path.
# NOTE(review): `workbook` is created under the `__main__` guard; this
# statement presumably belongs inside that block as well - confirm.
workbook.save(filename="/home/user/pubmed.xlsx")
相關連結