使用biopython抓取Pubmed作者Email資料

一、準備

    1. 安裝 pycharm(或 visual studio code)

    2. 安裝pip、biopython、openpyxl(本文章使用2.4.9版)等套件

    3. 到 NCBI網站 註冊一個帳號

二、建置環境

    1. 使用 pip 安裝 biopython 套件

sudo pip install biopython

    2. 使用 pip 安裝 openpyxl 套件

sudo pip install openpyxl==2.4.9

三、開始來寫程式吧

    1. 打開你的編輯器,先匯入會用到的模組

# -*- coding: utf-8 -*-
import re

from Bio import Entrez
from openpyxl import Workbook

    2. 再來寫兩個函式:search(搜尋 PMID)與 fetch_details(下載文章資料)

# Default contact e-mail; search() and fetch_details() assign it to
# Entrez.email before each request. The value here looks like a placeholder
# to be replaced with your own address.
pubmedEmail = "xxx@example.com"

#搜尋 PMID
    def search(query):
        """Run a PubMed ESearch for *query* and return the parsed result.

        Args:
            query: Search expression in PubMed query syntax.

        Returns:
            The structure produced by ``Entrez.read``; the matching PMIDs are
            available under the ``'IdList'`` key.
        """
        Entrez.email = pubmedEmail
        handle = Entrez.esearch(
            db='pubmed',          # database to search
            sort='relevance',
            # Upper bound on the number of IDs returned.
            # NOTE(review): NCBI documents a 10,000-record cap for PubMed
            # ESearch, so larger values may be truncated -- confirm.
            retmax='1000000',
            retmode='xml',        # response format
            term=query,           # the search expression
            api_key="xxxx"        # API key issued by NCBI
        )
        # Always release the network handle, even if parsing fails.
        try:
            return Entrez.read(handle)
        finally:
            handle.close()
    
    #下載 PMID 資料
    def fetch_details(id_list):
        """Download the PubMed records for the given PMIDs.

        Args:
            id_list: Iterable of PMIDs (strings or integers).

        Returns:
            The structure produced by ``Entrez.read`` for the EFetch response.
        """
        # EFetch expects a single comma-separated string of IDs; coerce each
        # entry to text so integer PMIDs are accepted as well.
        ids = ','.join(str(pmid) for pmid in id_list)
        Entrez.email = pubmedEmail
        handle = Entrez.efetch(
            db='pubmed',          # database to fetch from
            retmode='xml',        # response format
            id=ids,
            api_key="xxx"         # API key issued by NCBI
        )
        # Always release the network handle, even if parsing fails.
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

3. 然後就用這些函式來抓取資料吧

def _format_pub_date(pub_date):
    """Join whichever of Year / Month / Day exist in a PubDate record."""
    return " ".join(
        pub_date[key] for key in ("Year", "Month", "Day") if key in pub_date
    )


def _author_name(author):
    """Return "ForeName LastName" built from whichever fields exist."""
    return " ".join(
        author[key] for key in ("ForeName", "LastName") if key in author
    )


def _extract_email(affiliation):
    """Return the first e-mail address in *affiliation*, lower-cased, or "".

    Affiliation strings usually end with a full stop, which the pattern
    swallows as part of the address, so trailing dots are stripped. Addresses
    whose domain contains no dot are rejected.
    """
    match = re.search(r'[\w\.-]+@[\w\.-]+', affiliation)
    if match is None:
        return ""
    address = match.group().rstrip(".")
    domain = address.rsplit("@", 1)[-1]
    if "." not in domain:
        return ""
    return address.lower()


if __name__ == '__main__':
    # Create the workbook: one sheet for publications, one for author e-mails.
    workbook = Workbook()
    worksheet_publication = workbook.active
    worksheet_publication.title = "Publication"
    worksheet_publication.append(["Publication ID", "Author", "Publication date", "Journal", "Title", "Abstract", "Author info", "Research area", "Keyword"])

    worksheet_publication_owner = workbook.create_sheet(title="Publication Owner")
    worksheet_publication_owner.append(["Publication ID", "Email", "Name", "Country"])

    # The PubMed query to run.
    query = '(hmgb1) AND ("2014/01/01"[Date - Publication] : "3000"[Date - Publication])'

    # Look up the matching PMIDs, then download the full records.
    results = search(query)
    id_list = results['IdList']
    # Skip the download when nothing matched; there is nothing to request.
    papers = fetch_details(id_list) if id_list else {'PubmedArticle': []}

    # PMIDs already written to the "Publication" sheet (one row per paper).
    publication_exists = set()

    for paper in papers['PubmedArticle']:
        citation = paper['MedlineCitation']
        article = citation['Article']
        # Only papers that carry an author list can yield e-mail addresses.
        if "AuthorList" not in article:
            continue

        author_list = article['AuthorList']
        pmid = citation['PMID']
        journal = article['Journal']

        # Paper-level fields are the same for every author: compute them once.
        pub_date = _format_pub_date(journal['JournalIssue']['PubDate'])
        abstract = ""
        if "Abstract" in article and article['Abstract']['AbstractText']:
            abstract = article['Abstract']['AbstractText'][0]
        author_names = ", ".join(
            name for name in (_author_name(a) for a in author_list) if name
        )

        for author_row in author_list:
            name = _author_name(author_row)
            if not name:
                continue

            # Some author entries have no affiliation data at all.
            for affiliation_data in author_row.get('AffiliationInfo', []):
                affiliation = affiliation_data['Affiliation']
                mail_addr = _extract_email(affiliation)
                if not mail_addr:
                    continue

                # Write the publication row the first time the paper is seen.
                if pmid not in publication_exists:
                    publication_exists.add(pmid)
                    worksheet_publication.append([
                        pmid,
                        author_names,
                        pub_date,
                        journal.get('Title', ""),
                        article.get('ArticleTitle', ""),
                        abstract,
                    ])

                # Write one owner row per (author, e-mail) hit.
                worksheet_publication_owner.append([
                    pmid,
                    mail_addr,
                    name,
                    affiliation,
                ])

4. 最後就把它存成excel就完工囉

# Write the workbook to disk as an .xlsx file at a fixed path.
workbook.save(filename="/home/user/pubmed.xlsx")

相關連結

使用visual studio code執行python

Leave Comment

發佈留言必須填寫的電子郵件地址不會公開。 必填欄位標示為 *