【Python】 파이썬 유용 함수 모음

### Reference : https://changhsinlee.com/colab-import-python/

import requests

# Save datagenerators as file to colab working directory
# If you are using GitHub, make sure you get the "Raw" version of the code
url = 'https://github.com/JB243/nate9389/blob/main/Python/A_Collection_of_Useful_Functions_in_Python.py?raw=true'
r = requests.get(url)

# make sure your filename is the same as how you want to import 
with open('A_Collection_of_Useful_Functions_in_Python.py', 'w') as f:
    f.write(r.text)

# now we can import
from A_Collection_of_Useful_Functions_in_Python import SUM

### Warning
# Only SUM is implemented, but it seems to be implemented only at the beginning

SUM(2,6)

⑵ 아래에서 정의한 함수는 다음과 같이 사용할 수 있음 : 파이썬은 C언어처럼 함수를 정의 시 def를 사용

def SUM(a, b):
    return a+b

SUM(1,2)
# 3

2. 기초 함수 [목차]

⑴ 필수 암기 코드

① 타입 체크 : type(x)

② 길이 : len(x)

③ 차원 (e.g., numpy.ndarray, cv2.imread) : x.shape

④ 차원 (e.g., PIL.PngImagePlugin.PngImageFile, PIL.Image.open) : x.size

⑤ int로 형변환 : .astype(int)

⑥ 주어진 배열에서 고유한 원소들의 집합 : .unique()

⑦ x초만큼 대기 : time.sleep(x)

⑧ 리스트가 있을 때 index와 함께 for 문을 쉽게 생성하는 법 : for index, e in enumerate(my_list): ...

⑨ 주어진 문장을 '_' 단위로 쪼개고 그 중 앞 부분만을 취함 : myStr.split('_')[0]

⑵ 자연수 앞에 0을 채워서 총 다섯 자리가 되도록 하는 함수

n = 55
idx = str(n).zfill(5)

⑶ 주어진 문자열을 모두 소문자로 혹은 모두 대문자로

_str = "ABCdef"

_str.lower()
# 'abcdef'

_str.upper()
# 'ABCDEF'

⑷ 주어진 문자열에서 특정 문자열로 시작하는지 여부

a = "ABCD"

a.startswith('AB')
# TRUE

''' if a is not string object
a.str.startswith('AB')
# TRUE
'''

⑸ 주어진 문자열에서 특정 문자열로 끝나는지 여부

a = "ABCD"

a.endswith('CD')
# TRUE

⑹ 주어진 문자열(given_str)에서 특정 문자열(partial_str)을 포함하는지 여부

def is_contain(given_str, partial_str):    
    s = ''
    for char in given_str:
        s = s + char
        if s.endswith(partial_str):
            return True
    
    return False

⑺ 주어진 문자열(str)에서 앞 몇 개의 문자를 제거한 뒤 반환하는 함수

def str_omit_forward(_str, n):
    return _str[n:]

⑻ 주어진 문자열(str)에서 뒤 몇 개의 문자를 제거한 뒤 반환하는 함수

def str_omit_backward(_str, n):
    return _str[:-n]

⑼ 주어진 원소(e)가 리스트(l)에 포함되는지 여부

def is_element_in_list(e, l):
    for i in range(len(l)):
        if e == l[i]:
            return True
    return False

⑽ 주어진 원소(e)가 리스트(l) 내 몇 번째 원소인지를 출력하는 함수

def where_element_in_list(e, l):
    for i in range(len(l)):
        if e == l[i]:
            return i
    return -1

⑾ 두 리스트의 교집합을 출력하는 함수

def is_element_in_list(e, l):
    for i in range(len(l)):
        if e == l[i]:
            return True
    return False
    
def my_intersection(l1, l2):
    l = []
    flag = 0
    for i in range(len(l1)):
        if is_element_in_list(l1[i], l2):
            l[flag] = l1[i]
            flag = flag + 1

    return l

⑿ 날짜 표기 변환 함수 : 'Mar 02, 2018'을 20180301로 변환

def convert_date(_str):    
    def month_to_num(_str):
        if _str == 'Jan':
            return '01'
        if _str == 'Feb':
            return '02'
        if _str == 'Mar':
            return '03'
        if _str == 'Apr':
            return '04'
        if _str == 'May':
            return '05'
        if _str == 'Jun':
            return '06'
        if _str == 'Jul':
            return '07'
        if _str == 'Aug':
            return '08'
        if _str == 'Sep':
            return '09'
        if _str == 'Oct':
            return '10'
        if _str == 'Nov':
            return '11'
        if _str == 'Dec':
            return '12'    
    
    flag = 0
    s1 = '' # Mar
    s2 = '' # 01
    s3 = '' # 2018
    
    for char in _str:
        ## closing
        if char == ' ' and flag == 0:
            flag = 1
        elif char == ',' and flag == 1: 
            flag = 2
        elif char == ' ' and flag == 2:
            flag = 3
        ## propagation
        elif flag == 0:
            s1 = s1 + char
        elif flag == 1:
            s2 = s2 + char
        elif flag == 3:
            s3 = s3 + char
    
    return str(s3) + str(month_to_num(s1)) + str(s2)

⒀ 두 개의 리스트로 데이터프레임을 만드는 방법

import pandas as pd

v1 = []
v2 = []

...

df = pd.DataFrame(
    {'Column1': v1,
     'Column2': v2
    })

⒁ 주어진 함수 MyFunc의 내용물 보기

import inspect
lines = inspect.getsource(MyFunc)
print(lines)

⒂ 변수 이름을 문자열(string)로 받고, 그 값을 동적으로 활용

def set_variable_from_string(variable_name, value):
    globals()[variable_name] = value

def get_variable_value_from_string(variable_name):
    return globals().get(variable_name, None)

# 변수 이름을 문자열로 설정
var_name = "sample_variable"

# 문자열을 사용하여 변수 생성
set_variable_from_string(var_name, 12345)

# 문자열을 사용하여 변수 값을 가져옴
value = get_variable_value_from_string(var_name)
print(f"{var_name} = {value}")

3. 수학 함수 [목차]

⑴ 최대공약수 구하는 코드

def gcd(x: int, y: int) -> int:
  while y:
    x, y = y, x % y
  return x

4. 파일 입출력 [목차]

⑴ csv 파일 읽기

import pandas as pd

x = pd.read_csv("FILE.csv", header=None, names= ['barcodes','tissue','row','col','imgrow','imgcol'])

⑵ 텍스트 파일 읽기

import csv
import sys

def read_txt(txt_dir):
    l = []
    f = open(txt_dir, 'r', encoding = 'utf-8')
 
    for line in csv.reader(f):
        ### if you want elements of list ###
        l.append(line) 
		
        ### if you want elements of str ###
        # l.append(''.join(line)) 
    
    f.close()
    return l

⑶ 데이터 프레임으로 바꾼 뒤 .csv 파일로 쓰기 : pandas.DataFrame에서 정확한 행/열을 가져올 때 iloc을 사용

# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
pandas.DataFrame(MY_VARIABLE).to_csv(FILE_PATH)

⑷ 디렉토리 내 모든 .csv 파일 읽기

import glob
print(glob.glob("./dir/*.csv"))

⑸ HDF (.h5) 파일 생성, 읽기, 그룹화 등

⑹ 웹사이트 크롤링(crawling) (레퍼런스)

import io
import requests
import pandas as pd

url = 'https://typeset.io/search?q=Microbiome'
github_session = requests.Session()
download = github_session.get(url).content

print(download)

⑺ pdf 파일 합치기 (ref)

from PyPDF2 import PdfWriter

merger = PdfWriter()

for pdf in ['pdf_1.pdf', 'pdf_2.pdf', 'pdf_3.pdf']:
    merger.append(pdf)

merger.write(target + ".pdf")
merger.close()

⑻ HTML to MarkDown

!pip install html2text
import html2text

def html_to_markdown(html_content):
    # Create an html2text converter object
    h = html2text.HTML2Text()
    h.ignore_links = False

    # Convert the HTML to Markdown
    markdown_content = h.handle(html_content)
    return markdown_content

# Example Usage
html_code = """
<h1>Hello World</h1>
<p>This is a paragraph. <a href="https://example.com">Here is a link</a>.</p>
"""

markdown_code = html_to_markdown(html_code)
print(markdown_code)

⑼ MarkDown to HTML

import markdown

def md_to_html(md_file_path, html_file_path):
    """
    Convert a Markdown file to an HTML file.

    :param md_file_path: Path to the input Markdown file.
    :param html_file_path: Path to the output HTML file.
    """
    # Read the Markdown file
    with open(md_file_path, 'r', encoding='utf-8') as file:
        md_content = file.read()

    # Convert Markdown to HTML
    html_content = markdown.markdown(md_content)

    # Write the HTML content to a file
    with open(html_file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)

# Example usage
md_to_html('example.md', 'example.html')

⑽ 주어진 폴더 내 모든 PDF 파일 합치기 (가나다 순)

import os
import glob
import fitz  # PyMuPDF library

def merge_pdfs(input_folder, output_path):
    # input_folder: PDF 파일들이 있는 디렉토리
    # output_path: 결과물 PDF 파일의 경로

    # 입력 폴더에서 PDF 파일 목록 가져오기
    pdf_files = glob.glob(os.path.join(input_folder, '*.pdf'))

    print(pdf_files)
    
    # 가나다 순으로 정렬
    pdf_files.sort()

    # PyMuPDF의 PdfWriter 객체 생성
    pdf_writer = fitz.open()

    # 모든 PDF 파일을 합병
    for pdf_file in pdf_files:
        pdf_document = fitz.open(pdf_file)
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]
            pdf_writer.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)

    # 결과물 PDF 파일 저장
    pdf_writer.save(output_path)
    pdf_writer.close()

    # 파일 합병 완료 메시지 출력
    print(f'PDF 파일이 {output_path}로 합병되었습니다.')

# 입력 폴더와 결과물 파일 경로 설정
input_folder = r'D:/1'
output_path = r'D:/output.pdf'

# PDF 파일 합병 실행
merge_pdfs(input_folder, output_path)

⑾ 특정 html 문서에서 모든 다운로드 링크를 수집한 뒤 파일 다운로드

import requests
from bs4 import BeautifulSoup
import os

def download_file(url, local_filename):
    """Function to download a file from a given URL."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

# Read the modified HTML file
with open('modified_html_file.html', 'r') as file:
    html_content = file.read()

# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Find all 'a' tags with href attributes
links = soup.find_all('a', href=True)

# Folder to save files
save_folder = 'out' ## You must make this folder
os.makedirs(save_folder, exist_ok=True)

# Download each file
for link in links:
    url = link['href']
    filename = os.path.join(save_folder, url.split('/')[-1])
    print(f"Downloading {url}...")
    download_file(url, filename)

print("Download completed.")

⑿ GIF 파일을 여러 PNG 파일로 쪼개주는 코드

from PIL import Image
import os

def gif_to_png(gif_filename, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open the GIF file
    with Image.open(gif_filename) as img:
        # Loop over each frame in the GIF
        for frame in range(img.n_frames):
            # Select the frame
            img.seek(frame)

            # Save the frame as a PNG file
            frame_filename = f"{output_folder}/frame_{frame}.png"
            img.save(frame_filename)

            print(f"Saved {frame_filename}")

# Example usage
gif_filename = 'img.gif'  # Replace with your GIF file path
output_folder = 'out'  # Folder where PNG files will be saved
gif_to_png(gif_filename, output_folder)

⒀ 디렉토리 트리 구조를 출력하는 코드

import os

def print_tree(directory, prefix=''):
    """Recursively prints a tree diagram of the directory structure."""
    files = sorted(os.listdir(directory))
    for i, file in enumerate(files):
        path = os.path.join(directory, file)
        is_last = i == (len(files) - 1)
        print(f"{prefix}{'└── ' if is_last else '├── '}{file}")
        if os.path.isdir(path):
            # Add custom annotations here if file/directory matches certain criteria
            new_prefix = prefix + ('    ' if is_last else '│   ')
            print_tree(path, new_prefix)

# Replace '/path/to/directory' with the actual directory path you want to visualize
print_tree('/path/to/directory')

⒁ PDF 파일을 각 페이지별 PDF 파일들로 쪼개기

import PyPDF2

def split_pdf_pages(input_pdf_path, output_folder):
    # PDF 파일 열기
    with open(input_pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        
        # 각 페이지를 독립적인 PDF 파일로 저장
        for i in range(num_pages):
            writer = PyPDF2.PdfWriter()
            writer.add_page(reader.pages[i])
            
            output_filename = f"{output_folder}/page_{i+1}.pdf"
            with open(output_filename, "wb") as output_file:
                writer.write(output_file)
                
            print(f"Saved: {output_filename}")

# 함수 사용 예시
split_pdf_pages("path_to_your_input.pdf", "output_directory")

5. 자료구조 [목차]

⑴ 개요

① 배열

② 연결된 리스트

③ 스택

④ 큐

⑤ 트리

⑥ 그래프

⑦ 해시

⑧ 셋

⑵ counting island 알고리즘 : M × N 리스트인 world에서 1을 땅, 0을 물이라 할 때, 섬의 개수를 세는 함수

def counting_island(world: list)->int:
    class Node():
        def __init__(self, i, j): 
            self.i = i
            self.j = j
    
    class undirected_graph():     
        def __init__(self, V:list, E:list)->None:
            self.V = V[:]
            self.neighbor = {}
            for v in V:
                self.neighbor[v] = []
            for (v,w) in E:
                self.neighbor[v].append(w)

        def DFT_preorder(self)->int:
            count = 0
            if self.V:
                visited = {}
                for v in self.V:
                    visited[v]=False
                for v in self.V:
                    if not visited[v]: 
                        count += 1
                        self.__DFT__preorderHelp(visited, v)
            return count
                        
        def __DFT__preorderHelp(self, visited: list, v: int)->None:
            if not visited[v]:
                visited[v] = True
                for w in self.neighbor[v]:
                    self.__DFT__preorderHelp(visited, w)

    
    V = []
    E = []
    
    for i in range(len(world)):
        for j in range(len(world[0])):
            if world[i][j] == 1:
                V.append(Node(i, j))

    for v in V:
        for w in V:
            if w.i == v.i and w.j == v.j - 1:
                E.append((v,w))
            if w.i == v.i and w.j == v.j + 1:
                E.append((v,w))
            if w.i == v.i - 1 and w.j == v.j:
                E.append((v,w))
            if w.i == v.i + 1 and w.j == v.j:
                E.append((v,w))
    
    g = undirected_graph(V, E)
    
    return g.DFT_preorder()


world1 = [[1,1,1,1,0], [1,0,0,1,0], [1,1,0,1,0], [1,1,0,0,0]]
counting_island(world1) # 1

world2 = [[1,1,0,0,0], [1,1,0,0,0], [0,0,1,1,0], [0,0,0,0,1]]
counting_island(world2) # 3

world3 = [[0,1,0,1,0,1]]
counting_island(world3) # 3

6. 이미지 처리 [목차]

⑴ 기본적인 그래프 생성 코드

① x, y가 주어졌을 때 scatter plot을 그리는 함수

plt.scatter(x, y, alpha = 0.01)
plt.xlabel('brightness of img1')
plt.ylabel('brightness of img2')

② 주어진 이미지의 히스토그램을 그리는 함수

import numpy as np
import cv2
import matplotlib.pyplot as plt

img = cv2.imread('image.png')

fig, axes = plt.subplots(1, 1)
axes.hist(img.ravel(), bins=20)
axes.set_title('Histogram')

③ 4-parameter logistic (4PL) curve를 그리는 코드 : 약리학의 drug-response curve에서 많이 사용

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# 4PL 모델 정의
def four_param_logistic(x, A, B, C, D):
    return A + (B - A) / (1 + (x / C)**D)

# 예제 데이터
x_data = np.linspace(1, 100, 100)
y_data = four_param_logistic(x_data, 0.5, 10, 50, 1.5) + np.random.normal(size=len(x_data), scale=0.5)

# 파라미터 추정
popt, pcov = curve_fit(four_param_logistic, x_data, y_data, p0=[1, 1, 1, 1])

# 추정된 파라미터로 모델 예측
x_fit = np.linspace(1, 100, 400)
y_fit = four_param_logistic(x_fit, *popt)

# 결과 플로팅
plt.figure(figsize=(10, 5))
plt.scatter(x_data, y_data, label='Data')
plt.plot(x_fit, y_fit, color='red', label='4PL fit')
plt.title('4-Parameter Logistic Fit')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()

⑵ 이미지 저장

① 이미지를 png로 저장하는 예시 (ref)

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(5, 5))
fig.patch.set_visible(False)  
plt.imshow(demask_image_t[0], interpolation='nearest')
plt.axis('off')
plt.savefig('foo.png', bbox_inches='tight', dpi = 300)

② 이미지를 pdf로 저장하는 예시

...
plt.show()
fig.savefig('foo.pdf', bbox_inches='tight', dpi = 300)

⑶ cv2 이미지 RGB 컬러 재배열

import cv2
d = cv2.imread("/home/data/ST/WuSZ_2021_NG_TNBC/GSE/1160920F_spatial/spatial/tissue_hires_image.png")
d_array = cv2.cvtColor(d, cv2.COLOR_BGR2RGB)
plt.imshow(d_array)

⑷ cv2 이미지 resize : 예를 들어 img가 (1024, 1024, 3)이라면, img_re는 (512, 512, 3)이 됨

img_re = cv2.resize(img, dsize = (512, 512))

⑸ RGB 이미지를 흑백 이미지로 변환

## version 1.

img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


## version 2

import numpy as np

def RGBtoGray(img):
    out = np.empty((img.shape[0], img.shape[1]))

    for i in range(img.shape[0]):
        for j in range(img.shape[1]):
            out[i:i+1, j:j+1] = ( 0.2989*img[i:i+1, j:j+1, 0:1] + 
                                  0.5870*img[i:i+1, j:j+1, 1:2] + 
                                  0.1140*img[i:i+1, j:j+1, 2:3]
                                ).item()
      
    return out

⑹ 3-channel RGB 이미지에서 red image(1 × W × H)를 2차원 이미지(W × H)로 만드는 함수

img_ = img[:,:,0:1].reshape(img.shape[0], img.shape[1])

⑺ 이미지에서 특정 값을 다른 값으로 바꾸는 함수

def recolor(img, pre, post): 
    for i in range(img.shape[0]):
        for j in range(img.shape[1]):
            if img[i:i+1, j:j+1] == pre:
                img[i:i+1, j:j+1] = post
    return(img)

⑻ 정수형 numpy.ndarray 변수를 실수형 numpy.ndarray 변수로 바꾸기 (레퍼런스)

import cv2
import numpy as np
import matplotlib.pyplot as plt

img = cv2.imread('img.png')
img_cvt = np.array(img[:,:,:], dtype=np.float64)

⑼ 한 이미지에서 red와 green의 상관관계

import cv2
import matplotlib.pyplot as plt
from skimage.util import view_as_windows
import numpy as np
import scipy.stats

def two_image_correlation_RG(img1_dir):

    img1 = cv2.imread(img1_dir) # RGB image
    img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)

    l_img1 = []
    l_img2 = []

    for i in range(img1.shape[0]):
        for j in range(img1.shape[1]):
            l_img1.append((
                img1[i:i+1, j:j+1, 0:1] 
            ).item())

            l_img2.append((
                img1[i:i+1, j:j+1, 1:2] 
            ).item())

    print("brightness of img1")
    print(np.mean(l_img1))
    print("brightness of img2")
    print(np.mean(l_img2))

    print("img1-img2 correlation")
    print(scipy.stats.pearsonr(l_img1, l_img2) )

    plt.scatter(l_img1, l_img2, alpha = 0.01)
    plt.xlabel('brightness of img1')
    plt.ylabel('brightness of img2')

⑽ 이미지 간 상관관계

import cv2
import matplotlib.pyplot as plt
from skimage.util import view_as_windows
import numpy as np
import scipy.stats

def two_image_correlation(img1_dir, img2_dir):
	
    # img1 and img2 should be same in size
    img1 = cv2.imread(img1_dir) # RGB image
    img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
    img2 = cv2.imread(img2_dir) # RGB image
    img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)
		
    l_img1 = []
    l_img2 = []
		
    for i in range(img1.shape[0]):
        for j in range(img1.shape[1]):
            l_img1.append((
                0.2989*img1[i:i+1, j:j+1, 0:1] + 
                0.5870*img1[i:i+1, j:j+1, 1:2] + 
                0.1140*img1[i:i+1, j:j+1, 2:3]
            ).item())
		        
            l_img2.append((
                0.2989*img2[i:i+1, j:j+1, 0:1] + 
                0.5870*img2[i:i+1, j:j+1, 1:2] + 
                0.1140*img2[i:i+1, j:j+1, 2:3]
            ).item())
		
    print("brightness of img1")
    print(np.mean(l_img1))
    print("brightness of img2")
    print(np.mean(l_img2))
		
    print("img1-img2 correlation")
    print(scipy.stats.pearsonr(l_img1, l_img2) )
		
    plt.scatter(l_img1, l_img2, alpha = 0.05)
    plt.xlabel('brightness of img1')
    plt.ylabel('brightness of img2')
    # sns.regplot(l_b_CD31, l_b_Lipo, alpha = 0.05)

⑾ 한 3차원 이미지에서 red와 green의 상관관계

import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage.util import view_as_windows
import scipy.stats
import glob

def two_image_correlation_RG_3D(my_dir):

    images = glob.glob(my_dir + "*.jpg")
    
    l_img1 = []
    l_img2 = []

    for k in range(len(images)):
        print(k)
        img1 = cv2.imread(images[k]) 
        img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
        for i in range(img1.shape[0]):
            for j in range(img1.shape[1]):
                if (img1[i:i+1, j:j+1, 0:1] + img1[i:i+1, j:j+1, 1:2] + img1[i:i+1, j:j+1, 2:3]) != 0:
                    l_img1.append((
                        img1[i:i+1, j:j+1, 0:1] 
                    ).item())

                    l_img2.append((
                        img1[i:i+1, j:j+1, 1:2] 
                    ).item())

    print("brightness of img1")
    print(np.mean(l_img1))
    print("brightness of img2")
    print(np.mean(l_img2))

    print("img1-img2 correlation")
    print(scipy.stats.pearsonr(l_img1, l_img2) )

    plt.scatter(l_img1, l_img2, alpha = 0.01)
    plt.xlabel('brightness of img1')
    plt.ylabel('brightness of img2')

⑿ 한 쌍의 이미지의 SSIM 값을 구하는 코드

## Method 1

def SSIM(x, y):
    # assumption : x and y are grayscale images with the same dimension

    import numpy as np
    
    def mean(img):
        return np.mean(img)
        
    def sigma(img):
        return np.std(img)
    
    def cov(img1, img2):
        img1_ = np.array(img1[:,:], dtype=np.float64)
        img2_ = np.array(img2[:,:], dtype=np.float64)
                        
        return np.mean(img1_ * img2_) - mean(img1) * mean(img2)
    
    K1 = 0.01
    K2 = 0.03
    L = 256 # when each pixel spans 0 to 255
   
    C1 = K1 * K1 * L * L
    C2 = K2 * K2 * L * L
    C3 = C2 / 2
        
    l = (2 * mean(x) * mean(y) + C1) / (mean(x)**2 + mean(y)**2 + C1)
    c = (2 * sigma(x) * sigma(y) + C2) / (sigma(x)**2 + sigma(y)**2 + C2)
    s = (cov(x, y) + C3) / (sigma(x) * sigma(y) + C3)
        
    return l * c * s
    
    
    
 ## Method 2 
 
from skimage.metrics import structural_similarity as ssim
import cv2

x = cv2.imread('x.png', cv2.IMREAD_GRAYSCALE)
y = cv2.imread('y.png', cv2.IMREAD_GRAYSCALE)

score, diff = ssim(x, y, full=True)
print("SSIM:", score)

⒀ 한 쌍의 이미지의 mutual information 값을 구하는 코드 (레퍼런스)

def mutual_information(img1, img2):
    import numpy as np
    import cv2
    import matplotlib.pyplot as plt
    
    # img1 and img2 are 3-channel color images
    
    a = img1[:,:,0:1].reshape(img1.shape[0], img1.shape[1])
    b = img2[:,:,0:1].reshape(img2.shape[0], img2.shape[1])
    
    hgram, x_edges, y_edges = np.histogram2d(
     a.ravel(),
     b.ravel(),
     bins=20
    )

    pxy = hgram / float(np.sum(hgram))
    px = np.sum(pxy, axis=1) # marginal for x over y
    py = np.sum(pxy, axis=0) # marginal for y over x
    px_py = px[:, None] * py[None, :] # Broadcast to multiply marginals

    nzs = pxy > 0 # Only non-zero pxy values contribute to the sum
    
    return np.sum(pxy[nzs] * np.log(pxy[nzs] / px_py[nzs]))

⒁ 회귀곡선을 구하는 코드

from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit([[0,0], [1,1], [2,2]], [0, 1, 2])
# LinearRegression()
reg.coef_
# array([0.5, 0.5])

⒂ 공간전사체(ST)에서 배경 이미지, x 좌표, y 좌표, 컬러 c가 주어졌을 때 spot mapping image 생성 (레퍼런스)

def spatial_featuremap(img, x, y, c):
    tsimg = np.zeros(img.shape[:2])    
    tsimg_row = y # np.array형 변수
    tsimg_col = x # np.array형 변수
    for rr, cc, t in zip(tsimg_row, tsimg_col, c):
        r, c = draw.circle(rr, cc, radius = 2.5) 
        tsimg[r,c]= t
    return tsimg

⒃ 이미지의 양각(embossing), 음각(intaglio)을 판단하는 함수 : return 값 (-1 ~ 1)이 양수이면 양각, 음수이면 음각

def is_embossing(image):
    l_1 = []
    l_2 = []

    row_cm = image.shape[0]/2
    col_cm = image.shape[1]/2

    def distance(x:int, y:int):
        import numpy as np
        return np.sqrt(x*x + y*y)
    
    def my_sqrt(x:int, y:int, z:int):
        import numpy as np
        return np.sqrt(x*x + y*y + z*z)

    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            l_1.append( distance(i - row_cm, j - col_cm ))

            if len(image.shape) == 2: # grayscale
                l_2.append( image[i:i+1, j:j+1].item() )
            elif len(image.shape) == 3: # 3-channel color image
                l_2.append( 
                    my_sqrt( image[i:i+1, j:j+1, 0:1].item(),
                             image[i:i+1, j:j+1, 1:2].item(),
                             image[i:i+1, j:j+1, 2:3].item()
                           )
                )
   
    print("Distance from Center")
    print(np.mean(l_1))
    print("Pixel Intensity")
    print(np.mean(l_2))

    plt.scatter(l_1, l_2, alpha = 0.01)
    plt.ylim(0, 255)
    plt.xlabel('Distance from Center')
    plt.ylabel('Pixel Intensity')
    
    return scipy.stats.pearsonr(l_1, l_2)[0] * (-1)

⒄ 여러 이미지를 합쳐서 하나의 이미지로 표현하는 유용한 코드

fig, axes = plt.subplots(2,2,figsize=(6,20))

sns.barplot(df1[0:20], x='Gene', y ='Enrichment', ax = axes[0][0])
axes[0][0].tick_params(axis='x', rotation=90)
axes[0][0].set_title('DataFrame 1')

sns.barplot(df2[0:20], x='Gene', y ='Enrichment', ax = axes[0][1])
axes[0][1].tick_params(axis='x', rotation=90)
axes[0][1].set_title('DataFrame 2')

sns.barplot(df3[0:20], x='Gene', y ='Enrichment', ax = axes[1][0])
axes[1][0].tick_params(axis='x', rotation=90)
axes[1][0].set_title('DataFrame 3')

sns.barplot(df4[0:20], x='Gene', y ='Enrichment', ax = axes[1][1])
axes[1][1].tick_params(axis='x', rotation=90)
axes[1][1].set_title('DataFrame 4')

plt.tight_layout()
plt.show()

⒅ 이미지의 base64 코드가 잘린 경우 부분 복호화하는 코드

import base64
from PIL import Image, ImageFile
import io

# if the data is <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA+0AA..."
data = "iVBORw0KGgoAAAANSUhEUgAAA+0AA..." # you need to modify this part

# If the length of the data is 'num' more than a multiple of 4, trim the last character(s)
num = len(data) % 4
data = data[:-num]

# Add padding to ensure the length is a multiple of 4
padding = '=' * ((4 - len(data) % 4) % 4)
data = data + padding

try:
    decoded_data = base64.b64decode(data)
    # Process the decoded data as needed
except Exception as e:
    print("Error in decoding base64:", e)
    
image = Image.open(io.BytesIO(decoded_data))

ImageFile.LOAD_TRUNCATED_IMAGES = True

try:
    image = Image.open(io.BytesIO(decoded_data))
    image.show()
except IOError:
    print("Cannot open the image.")

⒆ chart-to-text (ref)

⒇ 주어진 이미지와 동일한 사이즈의 random image 생성하기

import numpy as np
import cv2

# Load the image using OpenCV
tsimg = cv2.imread(imgfile)

# Get the dimensions of the original image
height, width, channels = tsimg.shape
    
# Create a random image with the same dimensions
# Random values are generated in the range 0-255, which is the range for pixel values in an 8-bit image
random_img = np.random.randint(0, 256, (height, width, channels), dtype=np.uint8)
    
# Optionally, you can save or display the random image using OpenCV
cv2.imwrite('random_image.png', random_img)  # Save the random image

⒇ 이미지 내 글씨 폰트 설치 및 변경

# Method 1
import matplotlib.font_manager as font_manager

fonts = font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
available_fonts = [font_manager.FontProperties(fname=font).get_name() for font in fonts]
print('Droid Sans Fallback' in available_fonts)
plt.rcParams['font.family'] = 'Droid Sans Fallback'


# Method 2
import matplotlib.font_manager as fm

# List available fonts
available_fonts = fm.findSystemFonts(fontpaths=None, fontext='ttf')

# Print the list of available fonts
for font in available_fonts:
    print(font)
 
 
# Method 3
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

# 폰트 파일 경로 지정 (예시: 운영체제 및 설치된 폰트 위치에 따라 다름)
font_path = 'NanumGothic.ttf'

# 폰트 속성 설정
font_prop = font_manager.FontProperties(fname=font_path)

# Matplotlib에 폰트 속성 적용
plt.rcParams['font.family'] = font_prop.get_name()

# 플롯 생성 예시
plt.figure()
plt.text(0.5, 0.5, '한글 표시 테스트', fontproperties=font_prop, fontsize=12)
plt.show()

⒇ Otsu thresholding method

def otsu_thresholding(image_path):
    import cv2
    import matplotlib.pyplot as plt

    # Load the image from the specified path
    image = cv2.imread(image_path)

    # Convert the image to grayscale
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply Gaussian blur to the grayscale image
    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

    # Perform Otsu's thresholding to obtain the initial thresholded image
    _, initial_otsu_thresholded = cv2.threshold(blurred_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Modify the threshold value if needed
    threshold_adjustment = 5
    modified_threshold_value = _ + threshold_adjustment
    _, modified_otsu_thresholded = cv2.threshold(blurred_image, modified_threshold_value, 255, cv2.THRESH_BINARY)

    # Display the original grayscale, initial Otsu thresholded, and modified Otsu thresholded images
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(gray_image, cmap='gray')
    plt.title('Grayscale Image')
    plt.axis('off')

    plt.subplot(1, 3, 2)
    plt.imshow(initial_otsu_thresholded, cmap='gray')
    plt.title('Initial Otsu Thresholded Image')
    plt.axis('off')

    plt.subplot(1, 3, 3)
    plt.imshow(modified_otsu_thresholded, cmap='gray')
    plt.title('Modified Otsu Thresholded Image')
    plt.axis('off')

    plt.show()

    # Return the modified Otsu thresholded image
    return modified_otsu_thresholded

⒇ 인터넷 상의 이미지 시각화

import matplotlib.pyplot as plt
import requests
from PIL import Image
from io import BytesIO

# 이미지 URL
image_url = "https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdna%2FmxJpC%2FbtsrErXwO2Q%2FAAAAAAAAAAAAAAAAAAAAAF9-ECVGB6u2T5uk0v9lVZUzhIwFznPelnx8iRmYpiVT%2Fimg.png%3Fcredential%3DyqXZFxpELC7KVnFOS48ylbz2pIh7yKj8%26expires%3D1753973999%26allow_ip%3D%26allow_referer%3D%26signature%3DXpLrrI4NLtT03t%252FSKMv8Rxp60K8%253D"

# 이미지 로드
response = requests.get(image_url)
img = Image.open(BytesIO(response.content))

# 이미지 시각화
plt.figure(figsize=(8, 8))
plt.imshow(img)
plt.axis('off')  # 축 제거
plt.show()

7. 데이터 사이언스 [목차]

⑴ K means clustering

from sklearn.cluster import KMeans

X = [[1, 2, 3],[1, 2, 3.1],[2, 2, 4],[2, 2, 4.1]]
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)
cluster_labels = kmeans.labels_

print(cluster_labels)
# [1 1 0 0]

⑵ 수열이 주어졌을 때 지니계수를 구하기

def gini_coefficient(values):
    # 값의 리스트가 비어있거나 모든 값이 동일하면 지니계수는 0입니다.
    if not values or all(x == values[0] for x in values):
        return 0.0
    
    n = len(values)
    mean_value = sum(values) / n
    sum_of_differences = sum(abs(x - y) for x in values for y in values)

    # 지니 계수 계산
    gini = sum_of_differences / (2 * n**2 * mean_value)
    return gini

# 예제 데이터
income = [15, 20, 35, 40, 50]

# 지니 계수 계산
gini_index = gini_coefficient(income)
print(f"The Gini coefficient is {gini_index:.2f}")

⑶ 수열이 주어져 있을 때 Shannon entropy 구하기

import numpy as np

def shannon_index(data):
    # 데이터에서 고유 값과 각 값의 개수를 세기
    values, counts = np.unique(data, return_counts=True)
    # 개수를 사용하여 비율 계산
    proportions = counts / sum(counts)
    # 비율이 0이 아닌 경우에 대해 Shannon index 계산
    return -np.sum(proportions * np.log(proportions))

# 예제 데이터, 여기서 1이 3개, 2가 2개, 3이 1개 있음
data = [1, 1, 1, 2, 2, 3]

# Shannon index 계산
index = shannon_index(data)
print(f"Shannon index: {index}")

8. 딥러닝 함수 [목차]

⑴ 딥러닝 예제 : MNIST (즉, 손으로 쓴 숫자들) 데이터에 딥러닝 모델 적용하기

import tensorflow as tf
from tensorflow.keras import layers, models

# Load the MNIST dataset
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the pixel values of the images to be between 0 and 1
train_images, test_images = train_images / 255.0, test_images / 255.0

# Define a simple sequential model
def create_model():
    model = models.Sequential([
        layers.Flatten(input_shape=(28, 28)),  # Flatten the 2D image to a 1D array
        layers.Dense(128, activation='relu'),  # First hidden layer with 128 neurons and ReLU activation
        layers.Dropout(0.2),                   # Dropout layer to reduce overfitting
        layers.Dense(10, activation='softmax')  # Output layer with 10 neurons for 10 classes and softmax activation
    ])
    return model

# Create the model
model = create_model()

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(train_images, train_labels, epochs=5)

# Evaluate the model
test_loss, test_acc = model.evaluate(test_images, test_labels)

print(f'Test accuracy: {test_acc}')

⑵ 6만 개의 데이터 Xtr, 6만 개의 레이블 Ytr이 있을 때, 1만 개의 Xte를 KNN으로 분류하는 알고리즘

import tensorflow as tf

(Xtr, Ytr), (Xte, Yte) = tf.keras.datasets.mnist.load_data()

print(Xtr.shape) # (60000, 28, 28)
print(Ytr.shape) # (60000,)
print(Xte.shape) # (10000, 28, 28)
print(Yte.shape) # (10000,)

Xtr_rows = Xtr.reshape(Xtr.shape[0], 28*28)
Xte_rows = Xte.reshape(Xte.shape[0], 28*28)

print(Xtr_rows.shape) # (60000, 784)
print(Xte_rows.shape) # (10000, 784)

def KNN_predict(Xtr_rows, Ytr, Xte_rows, dist_metric='L2'):
    import numpy as np
    
    class NearestNeighbor(object):
        def __init__(self):
            pass

        def train(self, X, Y):
            self.Xtr = X
            self.Ytr = Y

        def predict(self, X, dist_metric=dist_metric):
            num_test = X.shape[0]
            Ypred = np.zeros(num_test, dtype = self.Ytr.dtype)
        
            for i in range(num_test):
                if dist_metric=='L1': ## L1 distance
                    distances = np.sum(np.abs(self.Xtr - X[i, :]), axis = 1) 
                elif dist_metric=='L2': ## L2 distance
                    distances = np.sum(np.square(self.Xtr - X[i, :]), axis = 1) 
                elif dist_metric=='dot': ## dot distance
                    distances = np.dot(self.Xtr, X[i, :])
            
                min_index = np.argmin(distances)
                Ypred[i] = self.Ytr[min_index]
            
                if i%100 == 0:
                    print(i)
            return Ypred
        
    nn = NearestNeighbor()
    nn.train(Xtr_rows, Ytr)
    
    Yte_predict = nn.predict(Xte_rows, dist_metric)
    return Yte_predict


def KNN_evaluate(Xtr_rows, Ytr, Xte_rows, Yte, dist_metric='L2'):
    Yte_predict = KNN_predict(Xtr_rows, Ytr, Xte_rows, dist_metric)
    print ('accuracy: %f' + str(np.mean(Yte_predict == Yte)) )
    # L1 : accuracy = 47.29%
    # L2 : accuracy = 27.24%
    # dot : accuracy = 9.9%

KNN_evaluate(Xtr_rows, Ytr, Xte_rows, Yte)

⑶ 1D CNN으로 구현한 binary classifier 함수

# reference : https://wikidocs.net/80783

def binary_predict_by_1D_CNN (Xtr_rows, Ytr, Xte_rows, Yte):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
    from tensorflow.keras.models import load_model
    
    vocab_size = 10000
    embedding_dim = 256 # 임베딩 벡터의 차원
    dropout_ratio = 0.3 # 드롭아웃 비율
    num_filters = 256 # 커널의 수
    kernel_size = 3 # 커널의 크기
    hidden_units = 128 # 뉴런의 수

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim))
    model.add(Dropout(dropout_ratio))
    model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(dropout_ratio))
    model.add(Dense(1, activation='sigmoid'))
    
    model.summary()
    
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    history = model.fit(Xtr_rows, Ytr, epochs=20, validation_data=(Xte_rows, Yte), callbacks=[es, mc])    

def binary_evaluate_by_1D_CNN(Xtr_rows, Ytr, Xte_rows, Yte):
    from tensorflow.keras.models import load_model
    binary_predict_by_1D_CNN(Xtr_rows, Ytr, Xte_rows, Yte)
    loaded_model = load_model('best_model.h5')
    print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(Xte_rows, Yte)[1]))


##################
# Example 1. Mnist

import tensorflow as tf

(Xtr, Ytr), (Xte, Yte) = tf.keras.datasets.mnist.load_data()
Xtr_rows = Xtr.reshape(Xtr.shape[0], 28*28)  
Xte_rows = Xte.reshape(Xte.shape[0], 28*28)

print(Xtr_rows.shape) # (60000, 784)
print(Ytr.shape) # (60000,)
print(Xte_rows.shape) # (10000, 784)
print(Yte.shape) # (10000,)

binary_evaluate_by_1D_CNN(Xtr_rows, Ytr > 5, Xte_rows, Yte > 5)
# 1D CNN accuracy : 7.82%


##################
# Example 2. IMDB

from tensorflow.keras import datasets
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data(num_words=vocab_size)
max_len = 200
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

binary_evaluate_by_1D_CNN(X_train, y_train, X_test, y_test)
# 1D CNN accuracy : 88.92%

⑷ 임의의 이미지로부터 pre-trained CNN을 통해 512차원 feature를 추출하는 함수

# reference
## https://github.com/mexchy1000/spade/blob/master/spade_spatial_marker_by_deeplearning.py

import os
import numpy as np
import matplotlib.pyplot as plt
from skimage import draw
import pandas as pd
import argparse
from keras import backend as K
from keras.applications import vgg16, resnet50, inception_v3
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

K.set_image_data_format='channels_last'
os.environ['KERAS_BACKEND'] = 'tensorflow'

def features_512D_from_image_by_CNN (img, model_name = 'VGG16'):
    # Image Patch
    img_re = cv2.resize(img, dsize = (32, 32))
    tspatches = []
    tspatches.append(img_re)
    tspatches.append(img_re) # intentional duplicate
    tspatches = np.asarray(tspatches)    
    
    # Pretrained model
    if model_name == 'VGG16':
        pretrained_model = vgg16.VGG16(weights='imagenet', include_top = False, pooling='avg', input_shape = (32,32,3))
        X_in = tspatches.copy()
        X_in = vgg16.preprocess_input(X_in)
    elif model_name == 'ResNet50':
    	pretrained_model = resnet50.ResNet50(weights='imagenet', include_top = False, pooling='avg', input_shape = (32,32,3))
        X_in = tspatches.copy()
        X_in = resnet50.preprocess_input(X_in)
    elif model_name == 'InceptionV3':
        # 아래 코드에서 input_shape를 75 이상으로 늘리지 않으면 에러가 남
        pretrained_model = inception_v3.InceptionV3(weights='imagenet', include_top = False, pooling='avg', input_shape = (32,32,3))
        X_in = tspatches.copy()
        X_in = inception_v3.preprocess_input(X_in)
    
    pretrained_model.trainable = False
    pretrained_model.summary()
    
    
    # Get the features
    ts_features = pretrained_model.predict(X_in)
    
    return ts_features[0]

⑸ BERT 혹은 BioBERT를 Hugging Face에서 불러와서 주어진 문장에 대한 attention matrix를 만드는 함수

import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import seaborn as sns

# BERT 모델과 토크나이저 불러오기 (OPTION 1)
'''
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_attentions=True)
'''

# BioBERT 모델과 토크나이저 불러오기 (OPTION 2)
model_name = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_attentions=True)

# 입력 문장
sentence = "Find diseases associated with glucose"

# 토큰화
inputs = tokenizer(sentence, return_tensors='pt')

# 모델을 통해 출력값과 attention 가중치 계산
outputs = model(**inputs)
attentions = outputs.attentions  # 이 값이 각 층의 attention 가중치

# 첫 번째 층의 첫 번째 헤드의 attention 가중치 시각화
attention = attentions[0][0][0].detach().numpy()

# 토큰 리스트
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

# Attention 가중치 시각화
plt.figure(figsize=(10, 10))
sns.heatmap(attention, xticklabels=tokens, yticklabels=tokens, cmap='viridis')
plt.title('Attention Weights')
plt.show()

⑹ 임의의 가변 길이 자연어 문장을 그 의미를 고려하여 384차원으로 만드는 함수 (cf. CELLama)

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import torch
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBClassifier

def sentences_to_embedding(sentences):
    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = embedding_function.embed_documents(sentences)
    emb_res = np.asarray(db)
    return emb_res


sentences = []
sentences.append("What is the meaning of: obsolete")
sentences.append("What is the meaning of: old-fashioned")
sentences.append("What is the meaning of: demagogue")
emb_res = sentences_to_embedding(sentences)

⑺ 자연어 처리 및 LLM 유용 함수모음

9. 예외 처리 [목차]

⑴ 기본적인 예외 처리 구문

try:
    raise KeyError("Error")
except:
    print("An exception occurred")

# An exception occurred

⑵ error 메시지 출력 (ref.)

try:
	print(1/0)
except Exception as e:
	print(e)

10. 생물정보학 [목차]

⑴ highly variable gene을 구하는 코드

import scanpy as sc

tissue_dir = './outs'
adata1 = sc.read_visium(tissue_dir)

sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.settings.verbosity = 3

adata1.var_names_make_unique()
sc.pp.normalize_total(adata1, inplace=True)
sc.pp.log1p(adata1)
sc.pp.highly_variable_genes(adata1, flavor="seurat", n_top_genes=200)

gene_list=adata1.var.loc[adata1.var.highly_variable==True,:].index

⑵ highest expressing gene을 구하는 코드

import scanpy as sc
import numpy as np
import pandas as pd

pbmc = sc.read_h5ad("Tabula_Sapiens.h5ad")

# normalize counts matrix so that each 'cell' (barcode) has counts summing to 1
pbmc.X_norm = sc.pp.normalize_total(pbmc, target_sum=1, inplace=False)['X']
# create new adata.var column contaning mean of each column of adata.X_norm above
# this is total normalized counts per gene a.k.a. 'mean_total_expression'
pbmc.var['mean_expression'] = np.ravel(pbmc.X_norm.mean(0))
    
# return pd.DataFrame of n top-ranked genes by mean expression
x = pd.DataFrame(pbmc.var.nlargest(200, 'mean_expression')['mean_expression'])

⑶ GO plot 그리는 법 (ver. 1) (비추천)

# Reference
## https://github.com/mousepixels/sanbomics_scripts/blob/main/GO_in_python.ipynb
## https://snyk.io/advisor/python/goatools/functions/goatools.anno.genetogo_reader.Gene2GoReader

def GO(markers, max_terms = 6):
    # type(markers) = numpy.ndarray
    # dtype = object
    
    if len(markers) == 0:
        return
    
    from genes_ncbi_mus_musculus_proteincoding import GENEID2NT as GeneID2nt_mus
    from genes_ncbi_homo_sapiens_proteincoding import GENEID2NT as GeneID2nt_homo
    
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.obo_parser import GODag
    from goatools.anno.genetogo_reader import Gene2GoReader
    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    obodag = GODag("go-basic.obo")
    
    #run one time to initialize
    mapper = {}
    
    if markers[0] == markers[0].upper(): # Homo sapiens
        for key in GeneID2nt_homo:
            mapper[GeneID2nt_homo[key].Symbol] = GeneID2nt_homo[key].GeneID
    else:  # Mus musculus
        for key in GeneID2nt_mus:
            mapper[GeneID2nt_mus[key].Symbol] = GeneID2nt_mus[key].GeneID
        
    inv_map = {v: k for k, v in mapper.items()}
        
    if markers[0] == markers[0].upper():
        objanno = Gene2GoReader(fin_gene2go, taxids=[9606])
    else:
        objanno = Gene2GoReader(fin_gene2go, taxids=[10090])
    
    ns2assoc = objanno.get_ns2assc()

    goeaobj = ''
    if markers[0] == markers[0].upper():
        goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_homo.keys(), # List of human protein-coding genes
            ns2assoc, # geneid/GO associations
            obodag, # Ontologies
            propagate_counts = False,
            alpha = 0.05, # default significance cut-off
            methods = ['fdr_bh']) # defult multipletest correction method
    else:
        goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_mus.keys(), # List of mouse protein-coding genes
            ns2assoc, # geneid/GO associations
            obodag, # Ontologies
            propagate_counts = False,
            alpha = 0.05, # default significance cut-off
            methods = ['fdr_bh']) # defult multipletest correction method        
        
    GO_items = []

    temp = goeaobj.ns2objgoea['BP'].assoc
    for item in temp:
        GO_items += temp[item]
    
    temp = goeaobj.ns2objgoea['CC'].assoc
    for item in temp:
        GO_items += temp[item]
    
    temp = goeaobj.ns2objgoea['MF'].assoc
    for item in temp:
        GO_items += temp[item]
            
    def go_it(test_genes):
        print(f'input genes: {len(test_genes)}')
    
        mapped_genes = []
        for gene in test_genes:
            try:
                mapped_genes.append(mapper[gene])
            except:
                pass
        print(f'mapped genes: {len(mapped_genes)}')
        
        goea_results_all = goeaobj.run_study(mapped_genes)                
        goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
                
        GO = pd.DataFrame(list(map(lambda x: [x.GO, x.goterm.name, x.goterm.namespace, x.p_uncorrected, x.p_fdr_bh,\
                       x.ratio_in_study[0], x.ratio_in_study[1], GO_items.count(x.GO), list(map(lambda y: inv_map[y], x.study_items)),\
                       ], goea_results_sig)), columns = ['GO', 'term', 'class', 'p', 'p_corr', 'n_genes',\
                                                        'n_study', 'n_go', 'study_genes'])

        GO = GO[GO.n_genes > 1]        
        return GO
        
    df = go_it(markers)    
    df['per'] = df.n_genes/df.n_go
    
    if df.shape[0] > 0:
        df = df.sort_values(by=['p_corr'], axis=0, ascending=True)
        
        if df.shape[0] > max_terms:
            df = df[0:max_terms]

        import matplotlib.pyplot as plt
        import matplotlib as mpl
        from matplotlib import cm
        import seaborn as sns
        import textwrap
    
        fig, ax = plt.subplots(figsize = (0.5, 2.75))
        cmap = mpl.cm.bwr_r
        norm = mpl.colors.Normalize(vmin = df.p_corr.min(), vmax = df.p_corr.max())
        mapper = cm.ScalarMappable(norm = norm, cmap = cm.bwr_r)
        cbl = mpl.colorbar.ColorbarBase(ax, cmap = cmap, norm = norm, orientation = 'vertical')
    
        plt.figure(figsize = (2,4))
        ax = sns.barplot(data = df, x = 'per', y = 'term', palette = mapper.to_rgba(df.p_corr.values))
        ax.set_yticklabels([textwrap.fill(e, 22) for e in df['term']])
        plt.show()

⑷ GO plot 그리는 법 (ver. 2) (추천)

def GO(gene_list):
    # reference : https://gseapy.readthedocs.io/en/latest/gseapy_example.html
    
    import gseapy
    from gseapy import barplot, dotplot
    
    if gene_list[0] == gene_list[0].upper():
        enr = gseapy.enrichr(gene_list=gene_list, # or "./tests/data/gene_list.txt",
                     gene_sets=['GO_Biological_Process_2018','GO_Cellular_Component_2018', 'GO_Molecular_Function_2018'],
                     organism='human', # don't forget to set organism to the one you desired! e.g. Yeast
                     outdir=None, # don't write to disk
                    )
    else:
        enr = gseapy.enrichr(gene_list=gene_list, # or "./tests/data/gene_list.txt",
             gene_sets=['GO_Biological_Process_2018','GO_Cellular_Component_2018', 'GO_Molecular_Function_2018'],
             organism='mouse', # don't forget to set organism to the one you desired! e.g. Yeast
             outdir=None, # don't write to disk
            )

    try:
        ax = dotplot(enr.results,
                      column="Adjusted P-value",
                      x='Gene_set', # set x axis, so you could do a multi-sample/library comparsion
                      size=10,
                      top_term=5,
                      figsize=(3,5),
                      title = "GO plot",
                      xticklabels_rot=45, # rotate xtick labels
                      show_ring=True, # set to False to revmove outer ring
                      marker='o',
                     )
    except:
        print("ValueError: Warning: No enrich terms when cutoff = 0.05")

① gseapy는 plt.subplot과 호환되지 않는 것으로 보임

⑸ convert from ensembl.gene to gene.symbol (표를 이용한 방식)

def ensembl_to_gene(ensembl_list):
    ar = []

    human = pd.read_csv("https://blog.kakaocdn.net/dna/29YTj/btrS5iG9QOH/AAAAAAAAAAAAAAAAAAAAAN4pRSZe7GCDazzdlM1-CZaP1sAzS5VyLRz_k7jWdsWg/human_genes_36601.tsv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=Vk%2BFepO1ZajYKMIyFp9R1wu3pds%3D&attach=1&knm=tfile.tsv", sep = '\t', header = None)
    mouse = pd.read_csv("https://blog.kakaocdn.net/dna/wkjwJ/btrS1QSgrpD/AAAAAAAAAAAAAAAAAAAAAApiPGgqYqc3L5hRzUcv5cENDYKf-iApHkrzL6osd2En/mouse_genes_32285.tsv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=DWuKe7bZQMzfXetlVF8g5w3PZrY%3D&attach=1&knm=tfile.tsv", sep = '\t', header = None)

    for i in range(len(ensembl_list)):
        if 'ENSG' in ensembl_list[i]:  # human gene
            index = human[0].tolist().index(ensembl_list[i])
            ar.append(human[1][index])
        elif 'ENSMUSG' in ensembl_list[i]:  # mouse gene
            index = mouse[0].tolist().index(ensembl_list[i])
            ar.append(mouse[1][index])

    return(ar)

⑹ convert from gene.symbol to ensembl.gene (표를 이용한 방식)

def gene_to_ensembl(gene_list):
    ar = []

    human = pd.read_csv("https://blog.kakaocdn.net/dna/29YTj/btrS5iG9QOH/AAAAAAAAAAAAAAAAAAAAAN4pRSZe7GCDazzdlM1-CZaP1sAzS5VyLRz_k7jWdsWg/human_genes_36601.tsv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=Vk%2BFepO1ZajYKMIyFp9R1wu3pds%3D&attach=1&knm=tfile.tsv", sep = '\t', header = None)
    mouse = pd.read_csv("https://blog.kakaocdn.net/dna/wkjwJ/btrS1QSgrpD/AAAAAAAAAAAAAAAAAAAAAApiPGgqYqc3L5hRzUcv5cENDYKf-iApHkrzL6osd2En/mouse_genes_32285.tsv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=DWuKe7bZQMzfXetlVF8g5w3PZrY%3D&attach=1&knm=tfile.tsv", sep = '\t', header = None)

    for i in range(len(gene_list)):
        if gene_list[i] == gene_list[i].upper():  # human gene
            index = human[1].tolist().index(gene_list[i])
            ar.append(human[0][index])
        else:  # mouse gene
            index = mouse[1].tolist().index(gene_list[i])
            ar.append(mouse[0][index])

    return(ar)

⑺ human 유전자, mouse 유전자 상호 간 변환

import pandas as pd
import numpy as np

def find_idx(my_list, e):
    for i in range(len(my_list)):
        if my_list[i] == e:
            return i
    return -1

def human_to_mouse(human_gene:list):
    hom = pd.read_csv("https://blog.kakaocdn.net/dna/cVeqsA/btrS1JMnxyX/AAAAAAAAAAAAAAAAAAAAABCTUnmheA_kwpz3FOFSmlNvgvdAqBuFvzXcYA8mxMk1/HOM_MouseHumanSequence.csv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=ZUl4wyGF%2FheH6YQGH4ppZUZw1wE%3D&attach=1&knm=tfile.csv")

    mouse_gene = []

    for i in range(len(human_gene)):
        index = find_idx(hom['Symbol'], human_gene[i])
        DB_Class_Key = hom['DB Class Key'][index]
        hom_ = hom[hom['DB Class Key'] == DB_Class_Key]
        hom__ = hom_[hom_['Common Organism Name'] == 'mouse, laboratory']
        mouse_gene.append(np.array(hom__['Symbol'])[0])

    return mouse_gene

def mouse_to_human(mouse_gene:list):
    hom <- pd.read_csv("https://blog.kakaocdn.net/dna/cVeqsA/btrS1JMnxyX/AAAAAAAAAAAAAAAAAAAAABCTUnmheA_kwpz3FOFSmlNvgvdAqBuFvzXcYA8mxMk1/HOM_MouseHumanSequence.csv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=ZUl4wyGF%2FheH6YQGH4ppZUZw1wE%3D&attach=1&knm=tfile.csv")

    human_gene = []

    for i in range(len(mouse_gene)):
        index = find_idx(hom['Symbol'], mouse_gene[i])
        DB_Class_Key = hom['DB Class Key'][index]
        hom_ = hom[hom['DB Class Key'] == DB_Class_Key]
        hom__ = hom_[hom_['Common Organism Name'] == 'human']
        mouse_gene.append(np.array(hom__['Symbol'])[0])

    return human_gene

⑻ tissue_dir 디렉토리 내 matrix.mtx, barcodes.tsv, features.tsv 및 spatial 폴더가 있을 때, Visium 데이터를 Python 상에서 읽는 코드

import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.image import imread
import numpy as np
import cv2
import math

from pathlib import Path, PurePath
from typing import Union, Dict, Optional, Tuple, BinaryIO
import json

def to_spatial(adata, path, library_id = 'TNBC1'):
    
    adata.uns["spatial"][library_id] = dict()
    path = Path(path)
    
    files = dict(
        tissue_positions_file=path / 'spatial/tissue_positions_list.csv',
        scalefactors_json_file=path /  'spatial/scalefactors_json.json',
        hires_image=path / 'spatial/tissue_hires_image.png',
        lowres_image=path / 'spatial/tissue_lowres_image.png',
    )

    # check if files exists, continue if images are missing
    for f in files.values():
        if not f.exists():
            if any(x in str(f) for x in ["hires_image", "lowres_image"]):
                logg.warning(
                    f"You seem to be missing an image file.\n"
                    f"Could not find '{f}'."
                )
            else:
                raise OSError(f"Could not find '{f}'")

    adata.uns["spatial"][library_id]['images'] = dict()
    for res in ['hires', 'lowres']:
        try:
            adata.uns["spatial"][library_id]['images'][res] = imread(
                str(files[f'{res}_image'])
            )
        except Exception:
            raise OSError(f"Could not find '{res}_image'")

    # read json scalefactors
    adata.uns["spatial"][library_id]['scalefactors'] = json.loads(
        files['scalefactors_json_file'].read_bytes()
    )

    #adata.uns["spatial"][library_id]["metadata"] = {
    #    k: (str(attrs[k], "utf-8") if isinstance(attrs[k], bytes) else attrs[k])
    #    for k in ("chemistry_description", "software_version")
    #    if k in attrs
    #}

    # read coordinates
    positions = pd.read_csv(files['tissue_positions_file'], header=None)
    positions.columns = [
        'barcode',
        'in_tissue',
        'array_row',
        'array_col',
        'pxl_col_in_fullres',
        'pxl_row_in_fullres',
    ]
    positions.index = positions['barcode']

    adata.obs = adata.obs.join(positions, how="left")
    
    adata.obsm['spatial'] = adata.obs[
        ['pxl_row_in_fullres', 'pxl_col_in_fullres']
    ].to_numpy()
    
    adata.obs.drop(
        columns=['barcode', 'pxl_row_in_fullres', 'pxl_col_in_fullres'],
        inplace=True,
    )

    return adata


def load_adata(tissue_dir):
    adata1 = sc.read_mtx(tissue_dir + 'matrix.mtx') 
    adata1 = adata1.transpose() 
    cellname = pd.read_csv(tissue_dir + 'barcodes.tsv', sep="\t", header=None, index_col=1, names=['idx', 'barcode']) 
    cellname.index = cellname['idx']
    featurename = pd.read_csv(tissue_dir + 'features.tsv', sep='\t', header=None, index_col=1, names=['gene_ids', 'feature_types'])
    adata1.var = featurename
    adata1.obs = cellname
    adata1.uns["spatial"] = dict()
    adata1 = to_spatial(adata1, tissue_dir, library_id='Library_1')
    adata1 = adata1[adata1.obs.in_tissue == 1,:]
    adata1.var_names_make_unique()
    adata1.obs['unnormalized_counts'] = adata1.X.sum(axis=1).A1
    sc.pp.normalize_total(adata1, inplace=True)
    adata1.obs['normalized_counts'] = adata1.X.sum(axis=1).A1
    sc.pp.log1p(adata1)
    sc.pp.highly_variable_genes(adata1, flavor="seurat", n_top_genes=2000)

    return adata1

⑼ 유기화학 관련 파이썬 함수 모음

⑽ human, mouse 유전자 염기서열 호출 (단, human gene만 입력으로 하면 됨)

from Bio import Entrez
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import pandas as pd

def human_mouse_gene_sequence(input_gene_name):
    # Entrez email setup (required by NCBI)
    Entrez.email = "nate9389@gmail.com"
    
    # Function to fetch the sequence using gene ID
    def fetch_sequence(gene_id):
        handle = Entrez.efetch(db="nucleotide", id=gene_id, rettype="fasta", retmode="text")
        record = SeqIO.read(handle, "fasta")
        handle.close()
        return record.seq
    
    def get_gene_ids(input_gene, file_path="https://blog.kakaocdn.net/dna/cVeqsA/btrS1JMnxyX/AAAAAAAAAAAAAAAAAAAAABCTUnmheA_kwpz3FOFSmlNvgvdAqBuFvzXcYA8mxMk1/HOM_MouseHumanSequence.csv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=ZUl4wyGF%2FheH6YQGH4ppZUZw1wE%3D&attach=1&knm=tfile.csv"):
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
    
        # Normalize the input gene name to uppercase to match the 'Symbol' column
        input_gene = input_gene.upper()
    
        # Find the row with the human gene symbol
        human_row = df[(df['Common Organism Name'] == 'human') & (df['Symbol'].str.upper() == input_gene)]
        
        # If the human gene is not found, return an error message
        if human_row.empty:
            return "Error: Human gene not found.", None
        
        # Find the corresponding mouse gene using the DB Class Key
        db_class_key = human_row['DB Class Key'].values[0]
        mouse_row = df[(df['Common Organism Name'] == 'mouse, laboratory') & (df['DB Class Key'] == db_class_key)]
        
        # If the mouse gene is not found, return an error message
        if mouse_row.empty:
            return None, "Error: Corresponding mouse gene not found."
        
        # Retrieve the gene IDs
        human_gene_id = human_row['Nucleotide RefSeq IDs'].values[0].split(',')[0] # Taking the first RefSeq ID if multiple are present
        mouse_gene_id = mouse_row['Nucleotide RefSeq IDs'].values[0].split(',')[0] # Taking the first RefSeq ID if multiple are present
        
        return human_gene_id, mouse_gene_id

    # Get the human and mouse gene IDs
    human_gene_id, mouse_gene_id = get_gene_ids(input_gene_name)

    # Fetch the sequences
    human_sequence = fetch_sequence(human_gene_id)
    mouse_sequence = fetch_sequence(mouse_gene_id)

    return human_sequence, mouse_sequence

### Example
human, mouse = human_mouse_gene_sequence("COL1A1")

⑾ image를 Visium anndata obs에 넣는 파이썬 코드

import scanpy as sc
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy

tissue_dir1 = './outs/'
adata1 = sc.read_visium(tissue_dir1)

img1 = cv2.imread(tissue_dir1 + 'spatial/tissue_hires_image.png')
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)

hires = adata1.uns['spatial']['Parent_Visium_Human_OvarianCancer']['images']['hires']
tissue_hires_scalef = adata1.uns['spatial']['Parent_Visium_Human_OvarianCancer']['scalefactors']['tissue_hires_scalef']
coords = adata1.obsm['spatial'] * tissue_hires_scalef
adata1.obs['img'] = adata1.obs['in_tissue'] # just making a variable

for i in range(adata1.shape[0]):
    x_coord = int(np.round(coords[i][0]))
    y_coord = int(np.round(coords[i][1]))

    # Ensure the coordinates are within the bounds of the image
    x_coord = max(0, min(x_coord, hires.shape[1] - 1))
    y_coord = max(0, min(y_coord, hires.shape[0] - 1))

    # Extract the intensity of the pixel (for simplicity, using the red channel)
    intensity = hires[y_coord, x_coord, 0]  # Red channel
    adata1.obs['img'][i] = intensity
    
sc.pl.spatial(adata1, color = 'img', img_key=None, spot_size=300)

⑿ Visium ST 데이터를 ann data로 읽어들인 뒤 데이터 QC 작업을 하는 코드

def QC(tissue_dir):
    metafile = None
    adata1 = sc.read_visium(tissue_dir) 
    ### alternatively ###
    # adata1 = sc.read_10x_mtx(tissue_dir) 

    adata1.var_names_make_unique() 
    sc.pp.normalize_total(adata1, inplace=True) 
    sc.pp.log1p(adata1)
    # sc.pp.highly_variable_genes(adata1, flavor="seurat", n_top_genes=17189)
    adata1.obs['n_counts'] = adata1.X.sum(axis=1).A1
    sc.pl.spatial(adata1, img_key="hires", color='n_counts', alpha=0.7, cmap='jet')
    
    return adata1

⒀ sc.pl.spatial 크기 확대하기

fig, ax = plt.subplots(figsize=(10,10))
sc.pl.spatial(adata, color = 'total_counts', ax=ax, show=False)
plt.show()

⒁ sc.pl.spatial의 color bar를 제거하기

import matplotlib.pyplot as plt
import scanpy as sc

sc.pl.spatial(adata, img_key="hires", color='score', alpha=1.0, cmap='viridis', show=False)
fig = plt.gcf()
ax = fig.axes

# Find and remove the colorbar. The colorbar is usually the last axis in the figure.
if len(ax) > 1:  # If there's more than one axis (i.e., the colorbar is present)
    fig.delaxes(ax[-1])  # Remove the last axis (the colorbar)

plt.show()

⒂ adata를 transpose하기 : gene name과 barcode name 뒤집기

import scanpy as sc
import pandas as pd

# Assuming 'data' is your AnnData object
# Load your AnnData object if it's not already in memory
# data = sc.read_h5ad('/path/to/your/data.h5ad')

# Transpose the data matrix and swap the 'obs' and 'var' DataFrames
adata_transposed = sc.AnnData(X=data.X.T, obs=data.var, var=data.obs)

# If 'obs_names' and 'var_names' need to be retained, you might want to transfer them as well
adata_transposed.obs_names = data.var_names
adata_transposed.var_names = data.obs_names

# Now 'adata_transposed' is the transposed version of ‘data'

⒃ matrix.mtx, barcodes.tsv, features.tsv (or genes.tsv)가 있을 때 h5ad 생성하기 : 단, 세 파일 모두 prefix가 동일하다고 가정

import os
import glob
from scipy.io import mmread
import pandas as pd
import scanpy as sc

# Set the target directory here
target_directory = '/path/to/your/directory'

# Iterate over all .mtx files in the target directory
for mtx_file in glob.glob(os.path.join(target_directory, '*matrix.mtx')):
    prefix = mtx_file[:-10]  # Remove 'matrix.mtx' to get the prefix
    
    # Construct file names for barcodes and genes
    barcodes_file = prefix + 'barcodes.tsv'
    features_file = prefix + 'features.tsv'
    genes_file = prefix + 'features.tsv' if os.path.exists(prefix + 'features.tsv') else prefix + 'genes.tsv'
    
    # Check if all files exist
    if os.path.exists(mtx_file) and os.path.exists(barcodes_file) and os.path.exists(genes_file):
        # Load the data
        matrix = mmread(mtx_file).T.tocsr()  # Transpose to have genes as columns
        genes = pd.read_csv(genes_file, header=None, sep='\t')
        barcodes = pd.read_csv(barcodes_file, header=None, sep='\t')

        # Create an AnnData object
        adata = sc.AnnData(X=matrix, obs={'barcodes': barcodes[0].values}, var={'gene_ids': genes[0].values, 'gene_symbols': genes[1].values})
        
        # Output file name
        h5ad_file = prefix + '.h5ad'
        
        # Save the AnnData object
        adata.write(h5ad_file)
        
        # Delete the original files
        os.remove(mtx_file)
        os.remove(barcodes_file)
        os.remove(genes_file)

print("Conversion complete. Original files have been deleted.")

⒄ adata들의 리스트를 하나의 adata로 합치는 코드

# Note that adatas = [adata1, adata2, ...]
adata_concat = sc.concat(adatas, keys = "sample1 sample2 sample3 sample4 sample5 sample6".split(" "),index_unique='_', label = 'sample' )

⒅ h5 파일로부터 각 바코드별 리드수를 알려주는 코드

def get_mapped_reads(h5_file):
    with h5py.File(h5_file, 'r') as f:
        # Access the matrix group in the HDF5 file
        matrix_group = f['matrix']

        # Read the barcode, gene, and data datasets
        barcodes = np.array(matrix_group['barcodes']).astype(str)
        genes = np.array(matrix_group['features/name']).astype(str)
        data = np.array(matrix_group['data'])

        # Sum the data for each barcode to get the number of mapped reads per barcode
        barcode_read_counts = np.array(matrix_group['indptr'][1:] - matrix_group['indptr'][:-1])

        return barcodes, genes, barcode_read_counts

⒆ h5 파일로부터 각 피처(유전자)를 알려주는 코드

import h5py

# Path to your HDF5 file
file_path = f'./{datas[0]}/filtered_feature_bc_matrix.h5'  # Replace with your actual file path

# Open the HDF5 file and inspect its contents
with h5py.File(file_path, 'r') as f:
    # List all groups in the file
    print("Keys in HDF5 file:")
    print(list(f.keys()))

    # Access the 'matrix/features/name' to check what features are present
    features = f['matrix/features/name'][:]
    
    # Convert byte strings to normal strings (if necessary)
    features = [feature.decode('utf-8') for feature in features]
    
    print("\nFeatures in the HDF5 file:")
    for feature in features:
        print(feature)

⒇ omnipath를 이용하여 특정 유전자가 포함된 모든 ligand-receptor pair 출력하기

import pandas as pd
from typing import Optional
from anndata import AnnData
from omnipath.interactions import import_intercell_network

def load_ligand_receptor_pairs_omnipath(
        require_gene: Optional[str]=None
    ):
    """
    Load ligand-receptor pairs from the Omnipath database that are also
    in the provided dataset.

    Parameters
    ----------
    require_gene
        Only return ligand-receptor pairs where either the ligand or 
        receptor is this argument

    Returns
    -------
    List of ligand-receptor pair tuples
    """
    interactions = import_intercell_network(
        transmitter_params={"categories": "ligand"},
        receiver_params={"categories": "receptor"}
    )
    hom = pd.read_csv("https://blog.kakaocdn.net/dna/cVeqsA/btrS1JMnxyX/AAAAAAAAAAAAAAAAAAAAABCTUnmheA_kwpz3FOFSmlNvgvdAqBuFvzXcYA8mxMk1/HOM_MouseHumanSequence.csv?credential=yqXZFxpELC7KVnFOS48ylbz2pIh7yKj8&expires=1753973999&allow_ip=&allow_referer=&signature=ZUl4wyGF%2FheH6YQGH4ppZUZw1wE%3D&attach=1&knm=tfile.csv")
    genes_in_data = set(hom[hom['Common Organism Name'] == 'human']['Symbol'].tolist())
    lr_pairs = [
        (s, t)
        for s, t in zip(
            interactions['genesymbol_intercell_source'],
            interactions['genesymbol_intercell_target']
        )
        if s in genes_in_data and t in genes_in_data
    ]

    if require_gene:
        lr_pairs = [
            (s, t)
            for s, t in lr_pairs
            if s == require_gene or t == require_gene
        ]
    return lr_pairs
    
lrs = load_ligand_receptor_pairs_omnipath(require_gene='CXCL9')
print(lrs)

⒇ 주어진 시퀀스와 상보적인 염기서열 구하기

def reverse_complement(seq):
    ‘’'
    Returns the reverse complement of the input string representing a DNA sequence. 
    Works only with DNA sequences consisting solely of A, C, G, T or N characters. 
    Preserves the case of the input sequence.

    Args:
        seq (str): a DNA sequence string

    Returns:
        result (str): The reverse complement of the input DNA sequence string.
    '''

    # Used to easily translate strings
    compStrDNA = str.maketrans('ACGTacgt', 'TGCAtcga')

    # Translate then reverse seq
    return seq.translate(compStrDNA)[::-1]

reverse_complement('AaAC')
# GTtT

⒇ 여러 염기서열의 리스트로부터 PFM 구성하기

① ord(char) >> 1 & 3 결과 A, C, G, T는 각각 0, 1, 3, 2가 됨

② length ≥ max(len(seq))

import numpy as np

def build_pfm(sequences, length):
    """Function to build a PFM using entries from a fasta file

    Args:
        sequences (list): list of sequence strings
        length (int): size of pfm we are building

    Yields:
        pfm (numpy array): dimensions are 4xlength
    """
    
    pfm = np.zeros((4,length), dtype=int)
    for seq in sequences:
        counter = 0
        for char in list(seq):
            pfm[((ord(char) >> 1 & 3), counter)] += 1
            counter+=1
    pfm[3,:], pfm[2,:] = pfm[2,:], pfm[3,:].copy()
    return pfm

build_pfm(['AAA', 'ACCC'], 4)
# array([[2, 1, 1, 0],
#        [0, 1, 1, 1],
#        [0, 0, 0, 0],
#        [0, 0, 0, 0]])

⒇ PFM으로부터 PWM 구성하기 (단, p = 0.25, bg = 0.25로 일괄 지정)

def build_pwm(pfm):
    """Function to build a PWM from a PFM

    Args:
        pfm (numpy array): dimensions are 4xlength

    Yields:
        pwm (numpy array): dimensions are 4xlength
    """

    pwm = np.zeros((4,len(pfm[0])), dtype=float)
    sums = np.sum(pfm, axis = 0)
    p = .25
    bg = .25
    for i in range(4):
            for j in range(len(pwm[0])):
                    pwm[i,j] = np.log2((pfm[i,j] + p) / (sums[j]+p*4)) - np.log2(bg)
    return pwm

⒇ PWM에서 주어진 시퀀스의 스코어 구하기

def score_kmer(seq, pwm):
    """Function to score a kmer with a pwm
        kmer length is expected to be the same as pwm length

    Args:
        seq(str): kmer to score
        pwm (numpy array): pwm for scoring

    Yields:
        score (float): PWM score for kmer
    """
    
    score = 0    
    if len(seq) != len(pwm[0]):
        raise ValueError('K-mer and PWM are different lengths!')
    dna_to_index = str.maketrans('ACGT', '0123')    
    for j, val in enumerate(list(seq.translate(dna_to_index)), 0):
        score += pwm[int(val), j]

    return score

⒇ motif 탐색 알고리즘 : 길이가 긴 시퀀스에서 PWM 길이에 해당하는 모든 k-mer를 대상으로, +와 - 두 가닥을 모두 고려해 가장 높은 PWM 스코어를 갖는 k-mer를 탐색

def score_sequence(seq, pwm):
    """Function to score a nmer with a pwm
        This will scan sequence and score all 
        subsequences of length k with a pwm
        and return the maximum score

    Args:
        seq(str): nmer to score
        pwm (numpy array): pwm for scoring

    Yields:
        score (float): PWM score for nmer
        position (int): 0-based index of the best match location
    """
    
    max_score = -100
    max_index = 'None'
    max_strand = 'None'
    
    pwm_len = len(pwm[0])
    
    if len(seq) < pwm_len:
        raise ValueError('N-mer shorter than PWM!')
    
    for i in range(0, len(seq)-pwm_len+1):
        
        if score_kmer(seq[i:pwm_len+i], pwm) > max_score:
            max_score = score_kmer(seq[i:pwm_len+i], pwm)
            max_index = i
            max_strand = 0
            
        if score_kmer(reverse_complement(seq[i:pwm_len+i]), pwm) > max_score:
            max_score = score_kmer(reverse_complement(seq[i:pwm_len+i]), pwm)
            max_index = i
            max_strand = 1
    
    return max_score, max_index, max_strand

⒇ BWT pattern matching : 시퀀싱 alignment의 핵심 원리

def BWT(string):
  string += '\n'    
  t = [string[i:] + string[:i] for i in range(len(string))]
  t.sort()    
  bwt_string = ''.join([l[-1] for l in t])
  return bwt_string

def suffix_array(string):
  string += '\n'
  sa = [index for suffix, index in sorted((string[i:], i) for i in range(len(string)))]    
  return sa

def cal_count(string):
  mydict = {}
  for char in string:
    mydict[char] = 0
  for char in string:
    for key in mydict.keys():
      if key > char:            
        mydict[key] += 1
  return mydict

def cal_occur(bwt_string):
  mydict = {}
  flags = {}
  for char in bwt_string:
    mydict[char] = []
    flags[char] = 0
  for char in bwt_string:
    for key in mydict.keys():
      if key == char:
        flags[key] += 1
        mydict[key].append(flags[key])
      else:
        mydict[key].append(flags[key])
  return mydict

def update_range(lower, upper, count, occur, a):
  if lower == 0:
    lower_new = count[a] + 0 + 1
  else:
    lower_new = count[a] + occur[a][lower-1] + 1
  upper_new = count[a] + occur[a][upper]
  return lower_new, upper_new

def find_match(query, reference):
  def reverse(s):
    return s[::-1]

  count = cal_count(reference)
  occur = cal_occur(BWT(reference))

  lower = 0
  upper = len(BWT(reference))-1
  for char in reverse(query):
     if lower > upper: 
       break #no matching
     lower, upper = update_range(lower, upper, count, occur, char)
  sa = suffix_array(reference)
  matched_positions = sa[lower:upper+1]
  
  return sorted(matched_positions)

'''
reference = mississippi
bwt_reference = 'ipssm$pissii'
query = 'sis'
count: {'i': 0, 'm': 3 , 'p': 4, 's': 6 }
occur: {'$': [0,0,0,0,0,1,1,1,1,1,1,1],
        'i': [1,1,1,1,1,1,1,2,2,2,3,4],
        'm': [0,0,0,0,1,1,1,1,1,1,1,1],
        'p': [0,1,1,1,1,1,2,2,2,2,2,2],
        's': [0,0,1,2,2,2,2,2,3,4,4,4]}
find_match('sis', 'mississippi')
# [3]
'''

⒇ GSEA 파이썬 코드

def GSEA(gene_set, expression):
    """Perform GSEA testing on a gene set and expression data set

    Args:
        gene_set (list): list of gene names in a gene set
        expression (dict): dictionary with keys of gene names and values of expression

    Returns:
        enrichment (float): enrichment score for a given gene set
        scores (list): list of running enrichment scores as genes are processed
    """
    
    # Order genes in a descending orer according to expression 
    ranked_genes = sorted(expression.items(), key=lambda x: x[1], reverse=True)
    ranked_gene_names = [gene for gene, _ in ranked_genes]
    
    N = len(ranked_gene_names)
    Nh = len(gene_set)
    Nm = N - Nh
    
    if Nh == 0 or Nm == 0:
        raise ValueError("Gene set is empty or covers the entire gene universe.")
    
    hit_score = 1.0 / Nh
    miss_score = 1.0 / Nm
    
    running_score = 0.0
    scores = []

    for gene in ranked_gene_names:
        if gene in gene_set:
            running_score += hit_score
        else:
            running_score -= miss_score
        scores.append(running_score)
    
    # Enrichment score: maximum deviation from zero
    enrichment = max(scores)
    
    return enrichment, scores


def permute_GSEA(enrichment, gene_set, expression, permutations=1000, seed=42):
    """Compute p-value for GSEA results using random permutation testing.
    
    Args:
        enrichment (float): enrichment of the gene set
        gene_set (list): list of gene names in a gene set
        expression (dict): dictionary with keys of gene set names and values of expression
        permutations (int): number of permutations
        seed (int): seed for random.sample
        
    Returns:
        p_val (float): empirical p-pvalue for enrichment
        

    """
    import numpy as np 
    import random

    observed_enrichment = GSEA(gene_set, expression)[0]
    enrichments = []
    
    random.seed(seed)
    gene_list = list(expression.keys())
    
    for _ in range(permutations):
        permuted_gene_set = random.sample(gene_list, len(gene_set))
        permuted_enrichment = GSEA(permuted_gene_set, expression)[0]
        enrichments.append(permuted_enrichment)
    
    enrichments = np.array(enrichments)
    p_val = np.sum(enrichments >= enrichment) / (permutations + 1)

    return p_val

입력: 2022.05.07 00:43

수정: 2024.12.30 21:16

저작자표시 (새창열림)