In [1]:
import pandas as pd
import numpy as np
In [5]:
integrin=pd.read_excel("gtex_integrin_7_organs.xlsx")
In [6]:
integrin
Out[6]:
Unnamed: 0 primary_site ITGA10 ITGAD ITGAM ITGA3 ITGBL1 ITGAE ITGA2 ITGB3 ... ITGA6 ITGA2B ITGB1 ITGAL ITGA9 ITGB5 ITGA8 ITGA4 ITGA1 ITGA11
0 GTEX-13QIC-0011-R1a-SM-5O9CJ Brain 0.5763 -6.5064 2.2573 0.7832 1.0363 4.6035 2.5731 -2.8262 ... 2.8562 1.3846 5.8430 1.1316 -0.7108 3.5387 -0.0725 -0.4521 0.2029 -2.8262
1 GTEX-1399S-1726-SM-5L3DI Lung 4.9137 -3.6259 4.7307 7.1584 1.7702 4.9556 1.9149 2.6067 ... 4.2412 4.1211 7.7256 4.4900 2.9281 6.1483 5.1867 2.6185 4.7856 -0.0277
2 GTEX-PWCY-1326-SM-48TCU Ovary 2.3953 -5.0116 1.4547 4.2593 -0.7346 4.4149 0.2642 1.5216 ... 3.6816 1.5465 7.2964 -0.9406 2.7742 5.0414 2.0325 0.7579 2.2573 1.2516
3 GTEX-QXCU-0626-SM-2TC69 Lung 4.0541 -2.3147 4.5053 7.5651 4.1788 4.1772 5.3695 1.8444 ... 4.9631 1.9149 7.9947 3.3911 2.8462 6.7683 4.1636 2.7951 5.3284 1.2147
4 GTEX-ZA64-1526-SM-5CVMD Breast 2.0569 -2.4659 3.3993 3.1311 3.0074 4.4977 -1.7809 2.7139 ... 4.7340 0.6332 7.3496 -0.9406 2.5338 6.5696 1.7229 -0.6416 3.1195 1.1050
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1982 GTEX-QMRM-0826-SM-3NB33 Lung 5.3067 -3.8160 4.9065 7.5810 5.8714 4.7345 2.6185 3.1095 ... 5.6080 3.7324 8.2849 4.6201 3.6440 6.7052 5.1094 3.3364 5.8153 1.6604
1983 GTEX-YFCO-1626-SM-4W1Z3 Prostate 2.9581 -4.6082 1.1641 4.6938 1.5902 5.8625 -0.5125 1.7617 ... 3.8798 -1.4699 7.5163 -0.3752 2.9562 5.3035 4.4304 -0.9406 3.6136 0.4233
1984 GTEX-1117F-2826-SM-5GZXL Breast 4.3184 -6.5064 1.0433 4.8440 3.5498 4.6809 1.0293 3.3478 ... 5.3256 -0.0725 7.7516 1.1382 2.1411 7.1132 0.3796 0.0854 3.8650 1.0151
1985 GTEX-Q2AG-2826-SM-2HMJQ Brain 3.4622 -5.5735 1.5013 5.4835 1.7702 4.7517 0.6790 -3.1714 ... 1.1960 4.1740 4.3002 0.5470 -0.9971 3.7982 -0.2498 1.4808 -0.5125 -0.5125
1986 GTEX-XV7Q-0426-SM-4BRVN Lung 2.5585 -1.7809 6.7916 6.5865 2.7051 4.9519 4.3618 3.1892 ... 3.5779 2.8974 7.7685 4.8294 1.9149 5.9989 2.4117 2.4198 4.2080 1.0007

1987 rows × 29 columns

In [9]:
brain_Lung_data = integrin[integrin['primary_site'].isin(['Brain', 'Lung'])]     #filter data by organ, display both brain and liver data

#rearrange data
brain_Lung_vertical = brain_Lung_expression_only.melt(id_vars = 'primary_site', var_name = 'integrin_gene', value_name = 'expression_levels')
In [8]:
brain_Lung_expression_only=brain_Lung_data.iloc[:,1:]
In [11]:
brain_Lung_vertical
Out[11]:
primary_site integrin_gene expression_levels
0 Brain ITGA10 0.5763
1 Lung ITGA10 4.9137
2 Lung ITGA10 4.0541
3 Lung ITGA10 6.0732
4 Lung ITGA10 4.2510
... ... ... ...
38875 Brain ITGA11 -2.2447
38876 Brain ITGA11 -2.5479
38877 Lung ITGA11 1.6604
38878 Brain ITGA11 -0.5125
38879 Lung ITGA11 1.0007

38880 rows × 3 columns

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#define wwhat is X and what is Y in your model
X=brain_Lung_expression_only[['ITGA10']]
y=brain_Lung_expression_only['primary_site']

#define split between train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 0.94
In [17]:
#Switch ITGA10 to ITGAb4 to see how it impacts the accuracy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#define wwhat is X and what is Y in your model
X=brain_Lung_expression_only[['ITGB4']]
y=brain_Lung_expression_only['primary_site']

#define split between train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#define the model you want to use : logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# predict and evaluate
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")
Accuracy using ITGA10: 0.81
In [21]:
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = brain_Lung_expression_only[['ITGA10']]  # 👈 Use your chosen integrin
y = brain_Lung_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung)')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [23]:
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = brain_Lung_expression_only[['ITGB4']]  # 👈 Use your chosen integrin
y = brain_Lung_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGB4 expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [27]:
#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
#Step 1: Prepare data
X = brain_Lung_expression_only[['ITGB4', 'ITGA3']]  # 👈 Use your chosen integrin
y = brain_Lung_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1})  # Binary encoding

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]  # Probabilities for class "Lung"

# Step 5: Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # random guess line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Brain vs Lung) using ITGB4 nad ITGA10 expression')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Prepare features and target
selected_genes = ['ITGA10', 'ITGB4'] 
#X = integrins.iloc[:, -27:]  # Assuming the last 27 columns are integrins
X = integrin[selected_genes]  # Assuming the last 27 columns are integrins
y = integrin['primary_site']

# Step 2: Encode organ labels as numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 3: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Train multinomial logistic regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Step 5: Predict and evaluate
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Accuracy: 0.7939698492462312

Classification Report:
              precision    recall  f1-score   support

 Bone Marrow       0.77      1.00      0.87        10
       Brain       0.81      0.94      0.87       247
      Breast       0.64      0.41      0.50        44
       Liver       1.00      0.65      0.79        23
        Lung       0.76      0.88      0.82        43
       Ovary       0.50      0.10      0.17        10
    Prostate       0.75      0.14      0.24        21

    accuracy                           0.79       398
   macro avg       0.75      0.59      0.61       398
weighted avg       0.78      0.79      0.77       398


Confusion Matrix:
[[ 10   0   0   0   0   0   0]
 [  3 231   3   0   8   1   1]
 [  0  25  18   0   1   0   0]
 [  0   8   0  15   0   0   0]
 [  0   4   1   0  38   0   0]
 [  0   6   0   0   3   1   0]
 [  0  12   6   0   0   0   3]]
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
  warnings.warn(
In [ ]: