Skip to content

Instantly share code, notes, and snippets.

@drcjar
Last active September 4, 2020 17:52
Show Gist options
  • Select an option

  • Save drcjar/836cdd0d4cfac42fc427d1700648dc78 to your computer and use it in GitHub Desktop.

Select an option

Save drcjar/836cdd0d4cfac42fc427d1700648dc78 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import statsmodels.formula.api as smf\n",
"%matplotlib inline\n",
"from scipy.stats import scoreatpercentile\n",
"import math\n",
"import numpy as np\n",
"\n",
"pd.options.display.float_format = '{:,.2f}'.format\n",
"\n",
"import scipy.stats as stats\n",
"stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n",
"\n",
"# jobs_dataframe.csv\n",
"# job_tasks_dataframe.csv\n",
"# flat_dataframe.csv"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('flat_dataframe.csv')\n",
"reorderjobcat = {1.:5 , 2.2:4, 3.:3 , 2.1:4, 2.3:4, 5.:1 , 4.:2}\n",
"df['lowest_peto_cat_reordered'] = df.lowest_peto_cat.map(reorderjobcat) # make it so highest n is highest exposed\n",
"df['median_ssec_int'] = df.median_ssec.astype(int)\n",
"\n",
"df.to_csv('flat_data_sans_genotype.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# table one"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>case</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>466.00</td>\n",
" <td>74.42</td>\n",
" <td>8.53</td>\n",
" <td>34.00</td>\n",
" <td>69.00</td>\n",
" <td>75.00</td>\n",
" <td>80.00</td>\n",
" <td>95.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>494.00</td>\n",
" <td>76.59</td>\n",
" <td>7.84</td>\n",
" <td>53.00</td>\n",
" <td>72.00</td>\n",
" <td>77.00</td>\n",
" <td>82.00</td>\n",
" <td>95.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% 75% max\n",
"case \n",
"0 466.00 74.42 8.53 34.00 69.00 75.00 80.00 95.00\n",
"1 494.00 76.59 7.84 53.00 72.00 77.00 82.00 95.00"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby('case').age.describe()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case ethnicity \n",
"0 White 449\n",
" Asian / Asian British 8\n",
" Black / African/ Caribbean/ Black British 7\n",
" Other ethnic group 2\n",
"1 White 479\n",
" Asian / Asian British 11\n",
" Black / African/ Caribbean/ Black British 2\n",
" Arab 1\n",
" Mixed / Multiple ethnic groups 1\n",
"Name: ethnicity, dtype: int64\n",
"case ethnicity \n",
"0 White 0.96\n",
" Asian / Asian British 0.02\n",
" Black / African/ Caribbean/ Black British 0.02\n",
" Other ethnic group 0.00\n",
"1 White 0.97\n",
" Asian / Asian British 0.02\n",
" Black / African/ Caribbean/ Black British 0.00\n",
" Arab 0.00\n",
" Mixed / Multiple ethnic groups 0.00\n",
"Name: ethnicity, dtype: float64\n"
]
}
],
"source": [
"print(df.groupby('case').ethnicity.value_counts() )\n",
"print(df.groupby('case').ethnicity.value_counts(normalize=True) )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case median_ssec_int\n",
"0 1 39\n",
" 2 61\n",
" 3 73\n",
" 4 52\n",
" 5 99\n",
" 6 88\n",
" 7 54\n",
"1 1 37\n",
" 2 59\n",
" 3 71\n",
" 4 55\n",
" 5 93\n",
" 6 113\n",
" 7 66\n",
"Name: median_ssec_int, dtype: int64\n",
"case median_ssec_int\n",
"0 1 0.08\n",
" 2 0.13\n",
" 3 0.16\n",
" 4 0.11\n",
" 5 0.21\n",
" 6 0.19\n",
" 7 0.12\n",
"1 1 0.07\n",
" 2 0.12\n",
" 3 0.14\n",
" 4 0.11\n",
" 5 0.19\n",
" 6 0.23\n",
" 7 0.13\n",
"Name: median_ssec_int, dtype: float64\n"
]
}
],
"source": [
"print (df.groupby('case')['median_ssec_int'].value_counts(sort=False))\n",
"print (df.groupby('case')['median_ssec_int'].value_counts(sort=False,normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case current_smoker\n",
"0 0 436\n",
" 1 30\n",
"1 0 484\n",
" 1 10\n",
"Name: current_smoker, dtype: int64\n",
"case current_smoker\n",
"0 0 0.94\n",
" 1 0.06\n",
"1 0 0.98\n",
" 1 0.02\n",
"Name: current_smoker, dtype: float64\n"
]
}
],
"source": [
"print (df.groupby('case')['current_smoker'].value_counts(sort=False))\n",
"print (df.groupby('case')['current_smoker'].value_counts(sort=False,normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case ever_smoked\n",
"0 0 139\n",
" 1 327\n",
"1 0 121\n",
" 1 373\n",
"Name: ever_smoked, dtype: int64\n",
"case ever_smoked\n",
"0 0 0.30\n",
" 1 0.70\n",
"1 0 0.24\n",
" 1 0.76\n",
"Name: ever_smoked, dtype: float64\n"
]
}
],
"source": [
"print (df.groupby('case')['ever_smoked'].value_counts(sort=False))\n",
"print (df.groupby('case')['ever_smoked'].value_counts(sort=False, normalize=True))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>case</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>301.00</td>\n",
" <td>26.16</td>\n",
" <td>24.94</td>\n",
" <td>1.00</td>\n",
" <td>9.00</td>\n",
" <td>21.00</td>\n",
" <td>36.00</td>\n",
" <td>165.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>350.00</td>\n",
" <td>27.60</td>\n",
" <td>26.03</td>\n",
" <td>1.00</td>\n",
" <td>10.25</td>\n",
" <td>21.00</td>\n",
" <td>37.75</td>\n",
" <td>220.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% 75% max\n",
"case \n",
"0 301.00 26.16 24.94 1.00 9.00 21.00 36.00 165.00\n",
"1 350.00 27.60 26.03 1.00 10.25 21.00 37.75 220.00"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['packyrs'] > 0].groupby('case')['packyrs'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case mrc_score\n",
"0 0.00 254\n",
" 1.00 65\n",
" 2.00 80\n",
" 3.00 65\n",
" 4.00 2\n",
"1 0.00 35\n",
" 1.00 94\n",
" 2.00 165\n",
" 3.00 172\n",
" 4.00 28\n",
"Name: mrc_score, dtype: int64\n",
"case mrc_score\n",
"0 0.00 0.55\n",
" 1.00 0.14\n",
" 2.00 0.17\n",
" 3.00 0.14\n",
" 4.00 0.00\n",
"1 0.00 0.07\n",
" 1.00 0.19\n",
" 2.00 0.33\n",
" 3.00 0.35\n",
" 4.00 0.06\n",
"Name: mrc_score, dtype: float64\n"
]
}
],
"source": [
"print (df.groupby('case')['mrc_score'].value_counts(sort=False))\n",
"print (df.groupby('case')['mrc_score'].value_counts(sort=False, normalize=True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# table two"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case peto_exposed\n",
"0 0 173\n",
" 1 293\n",
"1 0 166\n",
" 1 328\n",
"Name: peto_exposed, dtype: int64\n",
"case peto_exposed\n",
"0 0 0.37\n",
" 1 0.63\n",
"1 0 0.34\n",
" 1 0.66\n",
"Name: peto_exposed, dtype: float64\n"
]
}
],
"source": [
"print(df.groupby('case').peto_exposed.value_counts(sort=False) )\n",
"print(df.groupby('case').peto_exposed.value_counts(sort=False,normalize=True) )"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"case lowest_peto_cat_reordered\n",
"0 1 74\n",
" 2 99\n",
" 3 115\n",
" 4 127\n",
" 5 51\n",
"1 1 72\n",
" 2 94\n",
" 3 124\n",
" 4 140\n",
" 5 64\n",
"Name: lowest_peto_cat_reordered, dtype: int64\n",
"case lowest_peto_cat_reordered\n",
"0 1 0.16\n",
" 2 0.21\n",
" 3 0.25\n",
" 4 0.27\n",
" 5 0.11\n",
"1 1 0.15\n",
" 2 0.19\n",
" 3 0.25\n",
" 4 0.28\n",
" 5 0.13\n",
"Name: lowest_peto_cat_reordered, dtype: float64\n"
]
}
],
"source": [
"print(df.groupby('case').lowest_peto_cat_reordered.value_counts(sort=False) )\n",
"print(df.groupby('case').lowest_peto_cat_reordered.value_counts(normalize=True,sort=False) )"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.80\n",
"1 0.00\n",
"2 0.00\n",
"3 0.00\n",
"4 0.00\n",
" ... \n",
"955 0.00\n",
"956 0.00\n",
"957 0.00\n",
"958 12.42\n",
"959 0.00\n",
"Name: fibre_ml_exposure, Length: 960, dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.fibre_ml_exposure"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" count mean std min 25% 50% 75% max\n",
"case \n",
"0 108.00 617.34 3,078.48 0.00 0.22 4.37 57.36 23,785.71\n",
"1 122.00 1,117.93 5,641.48 0.00 0.30 5.86 62.98 50,761.90\n",
" count mean std min 25% 50% 75% max\n",
"case \n",
"0 24.00 2,743.04 6,166.46 105.21 216.02 466.14 1,182.85 23,785.71\n",
"1 27.00 5,013.71 11,308.10 102.74 187.29 594.64 3,615.46 50,761.90\n",
" count mean std min 25% 50% 75% max\n",
"case \n",
"0 29.00 2,281.78 5,682.68 52.86 144.00 290.83 1,126.29 23,785.71\n",
"1 34.00 3,995.43 10,240.56 55.15 121.27 297.05 3,214.13 50,761.90\n",
" count mean std min 25% 50% 75% max\n",
"case \n",
"0 35.00 1,896.92 5,227.91 27.12 66.32 222.00 1,067.43 23,785.71\n",
"1 40.00 3,401.64 9,528.10 28.81 68.42 187.29 1,660.67 50,761.90\n"
]
}
],
"source": [
"print(df[df.fibre_ml_exposure > 0].groupby('case').fibre_ml_exposure.describe())\n",
"print(df[df.fibre_ml_exposure >= 100].groupby('case').fibre_ml_exposure.describe())\n",
"print(df[df.fibre_ml_exposure >= 50].groupby('case').fibre_ml_exposure.describe())\n",
"print(df[df.fibre_ml_exposure >= 25].groupby('case').fibre_ml_exposure.describe())\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# table three"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.692044\n",
" Iterations 3\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 958\n",
"Method: MLE Df Model: 1\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.0009785\n",
"Time: 18:34:48 Log-Likelihood: -664.36\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 0.2540\n",
"================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------\n",
"Intercept -0.0413 0.109 -0.380 0.704 -0.254 0.172\n",
"peto_exposed 0.1541 0.135 1.141 0.254 -0.111 0.419\n",
"================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.78 1.19 0.96\n",
"peto_exposed 0.90 1.52 1.17\n"
]
}
],
"source": [
"model = smf.logit('case ~ peto_exposed', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.631284\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 936\n",
"Method: MLE Df Model: 23\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.08869\n",
"Time: 18:34:50 Log-Likelihood: -606.03\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 9.668e-15\n",
"===================================================================================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------------------------------------------------------------------\n",
"Intercept -3.2245 0.786 -4.101 0.000 -4.766 -1.684\n",
"C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust] -0.2512 0.496 -0.507 0.612 -1.223 0.721\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.3597 0.894 0.402 0.688 -1.393 2.112\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust] -0.0053 0.516 -0.010 0.992 -1.018 1.007\n",
"C(centre_name)[T.Heart of England NHS Foundation Trust] -0.1430 0.493 -0.290 0.772 -1.110 0.824\n",
"C(centre_name)[T.Imperial College Healthcare NHS Trust] -1.4553 0.506 -2.874 0.004 -2.448 -0.463\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS Trust] 2.7931 0.849 3.290 0.001 1.129 4.457\n",
"C(centre_name)[T.Morriston Hospital] -0.0182 0.605 -0.030 0.976 -1.203 1.167\n",
"C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust] 1.1716 0.925 1.267 0.205 -0.641 2.985\n",
"C(centre_name)[T.North Bristol NHS Trust] -0.2862 0.515 -0.556 0.578 -1.296 0.723\n",
"C(centre_name)[T.Nottingham University Hospitals NHS Trust] -0.2194 0.500 -0.438 0.661 -1.200 0.761\n",
"C(centre_name)[T.Papworth Hospital NHS Foundation Trust] -0.2108 0.501 -0.421 0.674 -1.192 0.770\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] -0.4812 0.733 -0.656 0.512 -1.918 0.955\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust] -0.2196 0.574 -0.383 0.702 -1.345 0.905\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] -0.3008 0.587 -0.512 0.608 -1.451 0.850\n",
"C(centre_name)[T.Southampton University Hospitals NHS Trust] -0.2792 0.485 -0.576 0.565 -1.230 0.671\n",
"C(centre_name)[T.Taunton and Somerset NHS Foundation Trust] -0.1191 0.579 -0.206 0.837 -1.253 1.015\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust] -0.0031 0.555 -0.006 0.995 -1.091 1.085\n",
"C(centre_name)[T.University Hospital of South Manchester] 0.1289 0.501 0.257 0.797 -0.853 1.111\n",
"C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust] 2.7552 1.127 2.446 0.014 0.547 4.963\n",
"C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust] -0.6483 0.678 -0.956 0.339 -1.977 0.681\n",
"age 0.0416 0.009 4.677 0.000 0.024 0.059\n",
"ever_smoked 0.3310 0.158 2.097 0.036 0.022 0.640\n",
"peto_exposed 0.0875 0.146 0.599 0.549 -0.199 0.374\n",
"===================================================================================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 0.19 0.04\n",
"C(centre_name)[T.Aintree University Hospitals N... 0.29 2.06 0.78\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.25 8.27 1.43\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Found... 0.36 2.74 0.99\n",
"C(centre_name)[T.Heart of England NHS Foundatio... 0.33 2.28 0.87\n",
"C(centre_name)[T.Imperial College Healthcare NH... 0.09 0.63 0.23\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS T... 3.09 86.22 16.33\n",
"C(centre_name)[T.Morriston Hospital] 0.30 3.21 0.98\n",
"C(centre_name)[T.Norfolk and Norwich University... 0.53 19.78 3.23\n",
"C(centre_name)[T.North Bristol NHS Trust] 0.27 2.06 0.75\n",
"C(centre_name)[T.Nottingham University Hospital... 0.30 2.14 0.80\n",
"C(centre_name)[T.Papworth Hospital NHS Foundati... 0.30 2.16 0.81\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] 0.15 2.60 0.62\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Fou... 0.26 2.47 0.80\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] 0.23 2.34 0.74\n",
"C(centre_name)[T.Southampton University Hospita... 0.29 1.96 0.76\n",
"C(centre_name)[T.Taunton and Somerset NHS Found... 0.29 2.76 0.89\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospit... 0.34 2.96 1.00\n",
"C(centre_name)[T.University Hospital of South M... 0.43 3.04 1.14\n",
"C(centre_name)[T.University Hospitals Birmingha... 1.73 143.04 15.72\n",
"C(centre_name)[T.Worcestershire Acute Hospitals... 0.14 1.98 0.52\n",
"age 1.02 1.06 1.04\n",
"ever_smoked 1.02 1.90 1.39\n",
"peto_exposed 0.82 1.45 1.09\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked + C(centre_name) + peto_exposed', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.691792\n",
" Iterations 4\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 955\n",
"Method: MLE Df Model: 4\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.001342\n",
"Time: 18:35:11 Log-Likelihood: -664.12\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 0.7752\n",
"=====================================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------------------------\n",
"Intercept -0.0274 0.166 -0.166 0.869 -0.352 0.297\n",
"C(lowest_peto_cat_reordered)[T.2] -0.0244 0.219 -0.111 0.911 -0.454 0.406\n",
"C(lowest_peto_cat_reordered)[T.3] 0.1027 0.210 0.489 0.625 -0.309 0.515\n",
"C(lowest_peto_cat_reordered)[T.4] 0.1249 0.206 0.606 0.544 -0.279 0.529\n",
"C(lowest_peto_cat_reordered)[T.5] 0.2545 0.250 1.017 0.309 -0.236 0.745\n",
"=====================================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.70 1.35 0.97\n",
"C(lowest_peto_cat_reordered)[T.2] 0.63 1.50 0.98\n",
"C(lowest_peto_cat_reordered)[T.3] 0.73 1.67 1.11\n",
"C(lowest_peto_cat_reordered)[T.4] 0.76 1.70 1.13\n",
"C(lowest_peto_cat_reordered)[T.5] 0.79 2.11 1.29\n"
]
}
],
"source": [
"model = smf.logit('case ~ C(lowest_peto_cat_reordered)', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.630565\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 933\n",
"Method: MLE Df Model: 26\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.08973\n",
"Time: 18:35:59 Log-Likelihood: -605.34\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 6.451e-14\n",
"===================================================================================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------------------------------------------------------------------\n",
"Intercept -3.1901 0.798 -3.996 0.000 -4.755 -1.625\n",
"C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust] -0.2911 0.498 -0.584 0.559 -1.267 0.685\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.3443 0.894 0.385 0.700 -1.407 2.096\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust] -0.0436 0.518 -0.084 0.933 -1.059 0.972\n",
"C(centre_name)[T.Heart of England NHS Foundation Trust] -0.1706 0.495 -0.345 0.730 -1.141 0.800\n",
"C(centre_name)[T.Imperial College Healthcare NHS Trust] -1.5025 0.509 -2.951 0.003 -2.500 -0.505\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS Trust] 2.7754 0.849 3.268 0.001 1.111 4.440\n",
"C(centre_name)[T.Morriston Hospital] -0.0212 0.605 -0.035 0.972 -1.207 1.165\n",
"C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust] 1.1798 0.929 1.270 0.204 -0.641 3.000\n",
"C(centre_name)[T.North Bristol NHS Trust] -0.3192 0.516 -0.618 0.536 -1.331 0.693\n",
"C(centre_name)[T.Nottingham University Hospitals NHS Trust] -0.2394 0.502 -0.477 0.633 -1.223 0.745\n",
"C(centre_name)[T.Papworth Hospital NHS Foundation Trust] -0.2279 0.502 -0.454 0.650 -1.211 0.755\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] -0.5405 0.735 -0.735 0.462 -1.981 0.900\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust] -0.2412 0.576 -0.419 0.675 -1.370 0.887\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] -0.3293 0.588 -0.560 0.576 -1.483 0.824\n",
"C(centre_name)[T.Southampton University Hospitals NHS Trust] -0.2999 0.486 -0.617 0.537 -1.253 0.653\n",
"C(centre_name)[T.Taunton and Somerset NHS Foundation Trust] -0.1447 0.580 -0.250 0.803 -1.281 0.992\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust] -0.0439 0.557 -0.079 0.937 -1.135 1.047\n",
"C(centre_name)[T.University Hospital of South Manchester] 0.0894 0.503 0.178 0.859 -0.897 1.076\n",
"C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust] 2.7657 1.127 2.453 0.014 0.556 4.975\n",
"C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust] -0.6496 0.680 -0.955 0.340 -1.983 0.683\n",
"C(lowest_peto_cat_reordered)[T.2] -0.1935 0.237 -0.816 0.415 -0.658 0.271\n",
"C(lowest_peto_cat_reordered)[T.3] -0.0767 0.228 -0.336 0.737 -0.524 0.371\n",
"C(lowest_peto_cat_reordered)[T.4] 0.0603 0.223 0.270 0.787 -0.377 0.498\n",
"C(lowest_peto_cat_reordered)[T.5] -0.1063 0.271 -0.393 0.694 -0.637 0.424\n",
"age 0.0430 0.009 4.758 0.000 0.025 0.061\n",
"ever_smoked 0.3333 0.158 2.104 0.035 0.023 0.644\n",
"===================================================================================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 0.20 0.04\n",
"C(centre_name)[T.Aintree University Hospitals N... 0.28 1.98 0.75\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.24 8.13 1.41\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Found... 0.35 2.64 0.96\n",
"C(centre_name)[T.Heart of England NHS Foundatio... 0.32 2.23 0.84\n",
"C(centre_name)[T.Imperial College Healthcare NH... 0.08 0.60 0.22\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS T... 3.04 84.75 16.05\n",
"C(centre_name)[T.Morriston Hospital] 0.30 3.21 0.98\n",
"C(centre_name)[T.Norfolk and Norwich University... 0.53 20.09 3.25\n",
"C(centre_name)[T.North Bristol NHS Trust] 0.26 2.00 0.73\n",
"C(centre_name)[T.Nottingham University Hospital... 0.29 2.11 0.79\n",
"C(centre_name)[T.Papworth Hospital NHS Foundati... 0.30 2.13 0.80\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] 0.14 2.46 0.58\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Fou... 0.25 2.43 0.79\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] 0.23 2.28 0.72\n",
"C(centre_name)[T.Southampton University Hospita... 0.29 1.92 0.74\n",
"C(centre_name)[T.Taunton and Somerset NHS Found... 0.28 2.70 0.87\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospit... 0.32 2.85 0.96\n",
"C(centre_name)[T.University Hospital of South M... 0.41 2.93 1.09\n",
"C(centre_name)[T.University Hospitals Birmingha... 1.74 144.78 15.89\n",
"C(centre_name)[T.Worcestershire Acute Hospitals... 0.14 1.98 0.52\n",
"C(lowest_peto_cat_reordered)[T.2] 0.52 1.31 0.82\n",
"C(lowest_peto_cat_reordered)[T.3] 0.59 1.45 0.93\n",
"C(lowest_peto_cat_reordered)[T.4] 0.69 1.64 1.06\n",
"C(lowest_peto_cat_reordered)[T.5] 0.53 1.53 0.90\n",
"age 1.03 1.06 1.04\n",
"ever_smoked 1.02 1.90 1.40\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked + C(centre_name) + C(lowest_peto_cat_reordered)', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.690922\n",
" Iterations 3\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 958\n",
"Method: MLE Df Model: 1\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.002598\n",
"Time: 18:42:04 Log-Likelihood: -663.29\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 0.06305\n",
"===============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-------------------------------------------------------------------------------\n",
"Intercept -0.1387 0.124 -1.115 0.265 -0.382 0.105\n",
"ever_smoked 0.2703 0.146 1.857 0.063 -0.015 0.556\n",
"===============================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.68 1.11 0.87\n",
"ever_smoked 0.99 1.74 1.31\n"
]
}
],
"source": [
"model = smf.logit('case ~ ever_smoked', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.631471\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 937\n",
"Method: MLE Df Model: 22\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.08842\n",
"Time: 18:42:05 Log-Likelihood: -606.21\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 4.746e-15\n",
"===================================================================================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------------------------------------------------------------------\n",
"Intercept -3.1662 0.779 -4.065 0.000 -4.693 -1.640\n",
"C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust] -0.2398 0.495 -0.485 0.628 -1.210 0.730\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.3385 0.893 0.379 0.705 -1.413 2.090\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust] -0.0125 0.516 -0.024 0.981 -1.023 0.998\n",
"C(centre_name)[T.Heart of England NHS Foundation Trust] -0.1352 0.493 -0.274 0.784 -1.101 0.830\n",
"C(centre_name)[T.Imperial College Healthcare NHS Trust] -1.4626 0.506 -2.893 0.004 -2.453 -0.472\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS Trust] 2.7912 0.848 3.290 0.001 1.128 4.454\n",
"C(centre_name)[T.Morriston Hospital] -0.0092 0.604 -0.015 0.988 -1.193 1.174\n",
"C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust] 1.2009 0.923 1.301 0.193 -0.609 3.011\n",
"C(centre_name)[T.North Bristol NHS Trust] -0.2879 0.514 -0.560 0.576 -1.296 0.720\n",
"C(centre_name)[T.Nottingham University Hospitals NHS Trust] -0.2179 0.500 -0.436 0.663 -1.197 0.761\n",
"C(centre_name)[T.Papworth Hospital NHS Foundation Trust] -0.2182 0.500 -0.436 0.663 -1.198 0.762\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] -0.4710 0.733 -0.643 0.520 -1.907 0.965\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust] -0.2145 0.574 -0.374 0.708 -1.339 0.910\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] -0.2962 0.587 -0.505 0.614 -1.446 0.854\n",
"C(centre_name)[T.Southampton University Hospitals NHS Trust] -0.2776 0.484 -0.573 0.566 -1.227 0.672\n",
"C(centre_name)[T.Taunton and Somerset NHS Foundation Trust] -0.1170 0.578 -0.202 0.840 -1.250 1.016\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust] 0.0018 0.554 0.003 0.997 -1.085 1.088\n",
"C(centre_name)[T.University Hospital of South Manchester] 0.1253 0.501 0.250 0.802 -0.856 1.106\n",
"C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust] 2.7560 1.126 2.447 0.014 0.549 4.963\n",
"C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust] -0.6500 0.678 -0.959 0.337 -1.978 0.678\n",
"age 0.0415 0.009 4.668 0.000 0.024 0.059\n",
"ever_smoked 0.3403 0.157 2.166 0.030 0.032 0.648\n",
"===================================================================================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 0.19 0.04\n",
"C(centre_name)[T.Aintree University Hospitals N... 0.30 2.08 0.79\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.24 8.08 1.40\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Found... 0.36 2.71 0.99\n",
"C(centre_name)[T.Heart of England NHS Foundatio... 0.33 2.29 0.87\n",
"C(centre_name)[T.Imperial College Healthcare NH... 0.09 0.62 0.23\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS T... 3.09 85.98 16.30\n",
"C(centre_name)[T.Morriston Hospital] 0.30 3.24 0.99\n",
"C(centre_name)[T.Norfolk and Norwich University... 0.54 20.30 3.32\n",
"C(centre_name)[T.North Bristol NHS Trust] 0.27 2.05 0.75\n",
"C(centre_name)[T.Nottingham University Hospital... 0.30 2.14 0.80\n",
"C(centre_name)[T.Papworth Hospital NHS Foundati... 0.30 2.14 0.80\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] 0.15 2.62 0.62\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Fou... 0.26 2.48 0.81\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] 0.24 2.35 0.74\n",
"C(centre_name)[T.Southampton University Hospita... 0.29 1.96 0.76\n",
"C(centre_name)[T.Taunton and Somerset NHS Found... 0.29 2.76 0.89\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospit... 0.34 2.97 1.00\n",
"C(centre_name)[T.University Hospital of South M... 0.43 3.02 1.13\n",
"C(centre_name)[T.University Hospitals Birmingha... 1.73 143.07 15.74\n",
"C(centre_name)[T.Worcestershire Acute Hospitals... 0.14 1.97 0.52\n",
"age 1.02 1.06 1.04\n",
"ever_smoked 1.03 1.91 1.41\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked + C(centre_name)', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.688784\n",
" Iterations 4\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 956\n",
"Method: MLE Df Model: 3\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.005684\n",
"Time: 18:42:07 Log-Likelihood: -661.23\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 0.05604\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept 2.236e-16 0.186 1.2e-15 1.000 -0.364 0.364\n",
"ever_smoked -0.0628 0.229 -0.274 0.784 -0.512 0.386\n",
"peto_exposed -0.2513 0.250 -1.004 0.316 -0.742 0.239\n",
"ever_smoked:peto_exposed 0.5373 0.299 1.799 0.072 -0.048 1.122\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.69 1.44 1.00\n",
"ever_smoked 0.60 1.47 0.94\n",
"peto_exposed 0.48 1.27 0.78\n",
"ever_smoked:peto_exposed 0.95 3.07 1.71\n"
]
}
],
"source": [
"model = smf.logit('case ~ ever_smoked*peto_exposed', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.629996\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 960\n",
"Model: Logit Df Residuals: 935\n",
"Method: MLE Df Model: 24\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.09055\n",
"Time: 18:42:09 Log-Likelihood: -604.80\n",
"converged: True LL-Null: -665.01\n",
"Covariance Type: nonrobust LLR p-value: 8.124e-15\n",
"===================================================================================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------------------------------------------------------------------\n",
"Intercept -3.0969 0.789 -3.928 0.000 -4.642 -1.551\n",
"C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust] -0.2009 0.496 -0.405 0.686 -1.174 0.772\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.4814 0.901 0.534 0.593 -1.284 2.247\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust] 0.0454 0.517 0.088 0.930 -0.967 1.058\n",
"C(centre_name)[T.Heart of England NHS Foundation Trust] -0.0932 0.494 -0.189 0.850 -1.061 0.874\n",
"C(centre_name)[T.Imperial College Healthcare NHS Trust] -1.4030 0.506 -2.772 0.006 -2.395 -0.411\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS Trust] 2.8188 0.848 3.322 0.001 1.156 4.482\n",
"C(centre_name)[T.Morriston Hospital] 0.0153 0.604 0.025 0.980 -1.168 1.198\n",
"C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust] 1.1958 0.924 1.294 0.196 -0.615 3.007\n",
"C(centre_name)[T.North Bristol NHS Trust] -0.2484 0.515 -0.483 0.629 -1.257 0.760\n",
"C(centre_name)[T.Nottingham University Hospitals NHS Trust] -0.1411 0.502 -0.281 0.779 -1.125 0.843\n",
"C(centre_name)[T.Papworth Hospital NHS Foundation Trust] -0.1551 0.501 -0.310 0.757 -1.137 0.827\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] -0.3924 0.736 -0.533 0.594 -1.834 1.049\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust] -0.1423 0.575 -0.247 0.805 -1.270 0.985\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] -0.2344 0.588 -0.399 0.690 -1.387 0.918\n",
"C(centre_name)[T.Southampton University Hospitals NHS Trust] -0.2419 0.484 -0.499 0.617 -1.191 0.707\n",
"C(centre_name)[T.Taunton and Somerset NHS Foundation Trust] -0.0710 0.578 -0.123 0.902 -1.204 1.062\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust] 0.0117 0.554 0.021 0.983 -1.074 1.098\n",
"C(centre_name)[T.University Hospital of South Manchester] 0.1869 0.502 0.373 0.709 -0.796 1.170\n",
"C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust] 2.8145 1.128 2.495 0.013 0.604 5.025\n",
"C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust] -0.5971 0.678 -0.881 0.378 -1.926 0.732\n",
"age 0.0419 0.009 4.708 0.000 0.024 0.059\n",
"ever_smoked 0.0299 0.248 0.121 0.904 -0.456 0.516\n",
"peto_exposed -0.2690 0.270 -0.996 0.319 -0.798 0.260\n",
"ever_smoked:peto_exposed 0.5055 0.322 1.571 0.116 -0.125 1.136\n",
"===================================================================================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 0.21 0.05\n",
"C(centre_name)[T.Aintree University Hospitals N... 0.31 2.16 0.82\n",
"C(centre_name)[T.Glasgow Royal Infirmary] 0.28 9.46 1.62\n",
"C(centre_name)[T.Guys’ and St Thomas’ NHS Found... 0.38 2.88 1.05\n",
"C(centre_name)[T.Heart of England NHS Foundatio... 0.35 2.40 0.91\n",
"C(centre_name)[T.Imperial College Healthcare NH... 0.09 0.66 0.25\n",
"C(centre_name)[T.Leeds Teaching Hospitals NHS T... 3.18 88.39 16.76\n",
"C(centre_name)[T.Morriston Hospital] 0.31 3.31 1.02\n",
"C(centre_name)[T.Norfolk and Norwich University... 0.54 20.22 3.31\n",
"C(centre_name)[T.North Bristol NHS Trust] 0.28 2.14 0.78\n",
"C(centre_name)[T.Nottingham University Hospital... 0.32 2.32 0.87\n",
"C(centre_name)[T.Papworth Hospital NHS Foundati... 0.32 2.29 0.86\n",
"C(centre_name)[T.Portsmouth Hospitals NHS Trust] 0.16 2.86 0.68\n",
"C(centre_name)[T.Royal Devon and Exeter NHS Fou... 0.28 2.68 0.87\n",
"C(centre_name)[T.Royal Infirmary of Edinburgh] 0.25 2.50 0.79\n",
"C(centre_name)[T.Southampton University Hospita... 0.30 2.03 0.79\n",
"C(centre_name)[T.Taunton and Somerset NHS Found... 0.30 2.89 0.93\n",
"C(centre_name)[T.The Newcastle Upon Tyne Hospit... 0.34 3.00 1.01\n",
"C(centre_name)[T.University Hospital of South M... 0.45 3.22 1.21\n",
"C(centre_name)[T.University Hospitals Birmingha... 1.83 152.17 16.68\n",
"C(centre_name)[T.Worcestershire Acute Hospitals... 0.15 2.08 0.55\n",
"age 1.02 1.06 1.04\n",
"ever_smoked 0.63 1.67 1.03\n",
"peto_exposed 0.45 1.30 0.76\n",
"ever_smoked:peto_exposed 0.88 3.11 1.66\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*peto_exposed + C(centre_name)', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# table four"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/pandas/core/indexing.py:376: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[key] = _infer_fill_value(value)\n",
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/pandas/core/indexing.py:494: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[item] = s\n"
]
}
],
"source": [
"gt = pd.read_csv('genotyping_cleaned.csv')\n",
"\n",
"transforms = {'Heterozygous':'(G;T)', 'Homozygous Allele 1':'(G;G)', 'Homozygous Allele 2':'(T;T)',\n",
" 'Homozygous Allele 1 ':'(G;G)', 'Homoxygous Allele 1':'(G;G)',\n",
" ' Homozygous Allele 2':'(T;T)', ' Homozygous Allele 1':'(G;G)'}\n",
"gt.genotype = gt.genotype.map(transforms)\n",
"\n",
"transforms = {\"(G;G)\":0, \"(G;T)\":1, \"(T;T)\":2}\n",
"\n",
"gt.genotype = gt.genotype.map(transforms) # change genotype to 0,1,2 (assumes additive model)\n",
"\n",
"# add genotype\n",
"genotype_lookup = gt[['participant_id', 'genotype']]\n",
"genotype_lookup.index = genotype_lookup['participant_id']\n",
"genotype_lookup = genotype_lookup['genotype'].to_dict()\n",
"\n",
"df['genotype'] = df.participant_id.map(genotype_lookup)\n",
"\n",
"df = df[df['genotype'].notnull()] # restrict analysis to participants with exposure data and genotype\n",
"\n",
"df.loc[:,'minor_allele'] = df['genotype'] > 0\n",
"\n",
"df.loc[:,'minor_allele'] = df.loc[:,'minor_allele'].astype(int)\n",
"\n",
"df.to_csv('flat_data_genotype_subset.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.688106\n",
" Iterations 4\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 818\n",
"Model: Logit Df Residuals: 814\n",
"Method: MLE Df Model: 3\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.006433\n",
"Time: 18:50:00 Log-Likelihood: -562.87\n",
"converged: True LL-Null: -566.52\n",
"Covariance Type: nonrobust LLR p-value: 0.06325\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -0.2091 0.205 -1.019 0.308 -0.611 0.193\n",
"ever_smoked 0.0483 0.252 0.192 0.848 -0.446 0.543\n",
"peto_exposed -0.1773 0.272 -0.653 0.514 -0.710 0.355\n",
"ever_smoked:peto_exposed 0.4471 0.324 1.378 0.168 -0.189 1.083\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.54 1.21 0.81\n",
"ever_smoked 0.64 1.72 1.05\n",
"peto_exposed 0.49 1.43 0.84\n",
"ever_smoked:peto_exposed 0.83 2.95 1.56\n"
]
}
],
"source": [
"model = smf.logit('case ~ ever_smoked*peto_exposed', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.681665\n",
" Iterations 4\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 818\n",
"Model: Logit Df Residuals: 813\n",
"Method: MLE Df Model: 4\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.01573\n",
"Time: 18:50:01 Log-Likelihood: -557.60\n",
"converged: True LL-Null: -566.52\n",
"Covariance Type: nonrobust LLR p-value: 0.001335\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -2.3285 0.694 -3.356 0.001 -3.689 -0.969\n",
"age 0.0279 0.009 3.206 0.001 0.011 0.045\n",
"ever_smoked 0.0716 0.254 0.281 0.778 -0.427 0.570\n",
"peto_exposed -0.1958 0.274 -0.715 0.474 -0.732 0.341\n",
"ever_smoked:peto_exposed 0.4725 0.327 1.446 0.148 -0.168 1.113\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.03 0.38 0.10\n",
"age 1.01 1.05 1.03\n",
"ever_smoked 0.65 1.77 1.07\n",
"peto_exposed 0.48 1.41 0.82\n",
"ever_smoked:peto_exposed 0.85 3.04 1.60\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.608986\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 818\n",
"Model: Logit Df Residuals: 814\n",
"Method: MLE Df Model: 3\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.1207\n",
"Time: 18:50:02 Log-Likelihood: -498.15\n",
"converged: True LL-Null: -566.52\n",
"Covariance Type: nonrobust LLR p-value: 1.918e-29\n",
"=========================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------------\n",
"Intercept -0.7736 0.166 -4.663 0.000 -1.099 -0.448\n",
"peto_exposed 0.0394 0.204 0.193 0.847 -0.361 0.440\n",
"genotype 1.3115 0.239 5.480 0.000 0.842 1.781\n",
"peto_exposed:genotype 0.3604 0.302 1.194 0.233 -0.231 0.952\n",
"=========================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.33 0.64 0.46\n",
"peto_exposed 0.70 1.55 1.04\n",
"genotype 2.32 5.93 3.71\n",
"peto_exposed:genotype 0.79 2.59 1.43\n"
]
}
],
"source": [
"model = smf.logit('case ~ peto_exposed*genotype', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.601830\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 818\n",
"Model: Logit Df Residuals: 812\n",
"Method: MLE Df Model: 5\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.1310\n",
"Time: 18:50:02 Log-Likelihood: -492.30\n",
"converged: True LL-Null: -566.52\n",
"Covariance Type: nonrobust LLR p-value: 2.872e-30\n",
"=========================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------------\n",
"Intercept -3.1222 0.753 -4.145 0.000 -4.598 -1.646\n",
"age 0.0285 0.009 3.014 0.003 0.010 0.047\n",
"ever_smoked 0.3028 0.173 1.749 0.080 -0.037 0.642\n",
"peto_exposed 0.0108 0.206 0.053 0.958 -0.393 0.415\n",
"genotype 1.3058 0.242 5.399 0.000 0.832 1.780\n",
"peto_exposed:genotype 0.3649 0.305 1.198 0.231 -0.232 0.962\n",
"=========================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 0.19 0.04\n",
"age 1.01 1.05 1.03\n",
"ever_smoked 0.96 1.90 1.35\n",
"peto_exposed 0.67 1.51 1.01\n",
"genotype 2.30 5.93 3.69\n",
"peto_exposed:genotype 0.79 2.62 1.44\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked + peto_exposed*genotype', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.623366\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 479\n",
"Model: Logit Df Residuals: 475\n",
"Method: MLE Df Model: 3\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.002356\n",
"Time: 18:50:03 Log-Likelihood: -298.59\n",
"converged: True LL-Null: -299.30\n",
"Covariance Type: nonrobust LLR p-value: 0.7031\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -1.0531 0.300 -3.512 0.000 -1.641 -0.465\n",
"ever_smoked 0.3882 0.363 1.068 0.285 -0.324 1.100\n",
"peto_exposed 0.2116 0.384 0.552 0.581 -0.540 0.963\n",
"ever_smoked:peto_exposed -0.2657 0.457 -0.582 0.561 -1.161 0.630\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.19 0.63 0.35\n",
"ever_smoked 0.72 3.00 1.47\n",
"peto_exposed 0.58 2.62 1.24\n",
"ever_smoked:peto_exposed 0.31 1.88 0.77\n"
]
}
],
"source": [
"model = smf.logit('case ~ ever_smoked*peto_exposed', data=df[df['genotype'] == 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.620634\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 479\n",
"Model: Logit Df Residuals: 474\n",
"Method: MLE Df Model: 4\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.006730\n",
"Time: 18:50:03 Log-Likelihood: -297.28\n",
"converged: True LL-Null: -299.30\n",
"Covariance Type: nonrobust LLR p-value: 0.4022\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -2.4977 0.954 -2.619 0.009 -4.367 -0.628\n",
"age 0.0193 0.012 1.604 0.109 -0.004 0.043\n",
"ever_smoked 0.3719 0.364 1.021 0.307 -0.342 1.086\n",
"peto_exposed 0.1741 0.385 0.452 0.651 -0.581 0.929\n",
"ever_smoked:peto_exposed -0.2138 0.459 -0.466 0.641 -1.114 0.686\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 0.53 0.08\n",
"age 1.00 1.04 1.02\n",
"ever_smoked 0.71 2.96 1.45\n",
"peto_exposed 0.56 2.53 1.19\n",
"ever_smoked:peto_exposed 0.33 1.99 0.81\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df[df['genotype'] == 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.576083\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 339\n",
"Model: Logit Df Residuals: 335\n",
"Method: MLE Df Model: 3\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.03332\n",
"Time: 18:50:04 Log-Likelihood: -195.29\n",
"converged: True LL-Null: -202.02\n",
"Covariance Type: nonrobust LLR p-value: 0.003738\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept 1.0296 0.368 2.795 0.005 0.308 1.752\n",
"ever_smoked -0.5516 0.434 -1.272 0.203 -1.401 0.298\n",
"peto_exposed -0.6931 0.471 -1.473 0.141 -1.615 0.229\n",
"ever_smoked:peto_exposed 1.5797 0.556 2.839 0.005 0.489 2.670\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 1.36 5.76 2.80\n",
"ever_smoked 0.25 1.35 0.58\n",
"peto_exposed 0.20 1.26 0.50\n",
"ever_smoked:peto_exposed 1.63 14.44 4.85\n"
]
}
],
"source": [
"model = smf.logit('case ~ ever_smoked*peto_exposed', data=df[df['genotype'] > 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.565499\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 339\n",
"Model: Logit Df Residuals: 334\n",
"Method: MLE Df Model: 4\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.05108\n",
"Time: 18:50:04 Log-Likelihood: -191.70\n",
"converged: True LL-Null: -202.02\n",
"Covariance Type: nonrobust LLR p-value: 0.0003737\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -2.1013 1.239 -1.696 0.090 -4.529 0.327\n",
"age 0.0405 0.015 2.628 0.009 0.010 0.071\n",
"ever_smoked -0.4274 0.441 -0.969 0.333 -1.292 0.437\n",
"peto_exposed -0.6464 0.477 -1.354 0.176 -1.582 0.289\n",
"ever_smoked:peto_exposed 1.5147 0.564 2.686 0.007 0.409 2.620\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 1.39 0.12\n",
"age 1.01 1.07 1.04\n",
"ever_smoked 0.27 1.55 0.65\n",
"peto_exposed 0.21 1.34 0.52\n",
"ever_smoked:peto_exposed 1.51 13.73 4.55\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df[df['genotype'] > 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ipfjes_id', 'case', 'dob', 'age', 'yob', 'agegroup', 'ethnicity',\n",
" 'ever_smoked', 'current_smoker', 'packyrs', 'participant_id', 'centre',\n",
" 'gp_coords', 'centre_coords', 'distfromcentre', 'ct', 'bx', 'fhx',\n",
" 'amiodarone', 'flecainade', 'nitrofurantoin', 'azathioprine',\n",
" 'gefitinib', 'ifosfamide', 'melphalan', 'rituximab', 'mrc0', 'mrc1',\n",
" 'mrc2', 'mrc3', 'mrc4', 'pc_sob', 'pc_cough', 'pc_incidental',\n",
" 'pc_incidental_desc', 'pc_other', 'comments', 'mrc_score',\n",
" 'peto_exposed', 'exposed_stone', 'exposed_wood', 'exposed_metal',\n",
" 'exposed_farm', 'exposed_asbestos', 'peto_dose', 'median_ssec',\n",
" 'fibre_ml_exposure', 'lowest_peto_cat', 'peto_shortlist',\n",
" 'coggan_shortlist', 'mean_pmr', 'highest_pmr', 'meso_pmr_dose',\n",
" 'agecat', 'ethcat', 'ctcat', 'bxcat', 'centre_name',\n",
" 'Aberdeen Royal Infirmary',\n",
" 'Aintree University Hospitals NHS Foundation Trust',\n",
" 'Glasgow Royal Infirmary', 'Guys’ and St Thomas’ NHS Foundation Trust',\n",
" 'Heart of England NHS Foundation Trust',\n",
" 'Imperial College Healthcare NHS Trust',\n",
" 'Leeds Teaching Hospitals NHS Trust', 'Morriston Hospital',\n",
" 'Norfolk and Norwich University Hospitals NHS Foundation Trust',\n",
" 'North Bristol NHS Trust', 'Nottingham University Hospitals NHS Trust',\n",
" 'Papworth Hospital NHS Foundation Trust',\n",
" 'Portsmouth Hospitals NHS Trust',\n",
" 'Royal Devon and Exeter NHS Foundation Trust',\n",
" 'Royal Infirmary of Edinburgh',\n",
" 'Southampton University Hospitals NHS Trust',\n",
" 'Taunton and Somerset NHS Foundation Trust',\n",
" 'The Newcastle Upon Tyne Hospitals NHS Foundation Trust',\n",
" 'University Hospital of South Manchester',\n",
" 'University Hospitals Birmingham NHS Foundation Trust',\n",
" 'Worcestershire Acute Hospitals NHS Trust', 'Arab',\n",
" 'Asian / Asian British', 'Black / African/ Caribbean/ Black British',\n",
" 'Mixed / Multiple ethnic groups', 'Other ethnic group', 'White',\n",
" 'definite UIP', 'no CT', 'other', 'possible UIP', 'ever_drug_exposed',\n",
" 'lowest_peto_cat_reordered', 'median_ssec_int', 'genotype',\n",
" 'minor_allele'],\n",
" dtype='object')"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"df[['ipfjes_id','case','minor_allele','genotype', 'peto_exposed','peto_dose','lowest_peto_cat_reordered', 'fibre_ml_exposure','ever_smoked', 'packyrs','age', 'ethnicity', 'ethcat', 'centre']].to_csv('cosseta.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# sensitivity analysis "
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df = df[df['centre'] != 2] # throw away centre 2 since it contains no genotyped cases"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.623254\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 808\n",
"Model: Logit Df Residuals: 785\n",
"Method: MLE Df Model: 22\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.1005\n",
"Time: 18:50:07 Log-Likelihood: -503.59\n",
"converged: True LL-Null: -559.86\n",
"Covariance Type: nonrobust LLR p-value: 3.864e-14\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -0.1533 0.320 -0.478 0.632 -0.781 0.475\n",
"C(centre)[T.3.0] -1.1856 0.449 -2.642 0.008 -2.065 -0.306\n",
"C(centre)[T.4.0] -0.0720 0.310 -0.232 0.816 -0.680 0.536\n",
"C(centre)[T.5.0] 0.2419 0.340 0.711 0.477 -0.425 0.909\n",
"C(centre)[T.6.0] 0.0016 0.342 0.005 0.996 -0.668 0.671\n",
"C(centre)[T.7.0] -0.1405 0.429 -0.327 0.744 -0.982 0.701\n",
"C(centre)[T.8.0] -0.0454 0.329 -0.138 0.890 -0.691 0.600\n",
"C(centre)[T.9.0] -1.4948 0.512 -2.922 0.003 -2.498 -0.492\n",
"C(centre)[T.10.0] -1.2839 0.350 -3.664 0.000 -1.971 -0.597\n",
"C(centre)[T.11.0] -0.0920 0.489 -0.188 0.851 -1.050 0.866\n",
"C(centre)[T.12.0] 0.2417 0.803 0.301 0.763 -1.332 1.816\n",
"C(centre)[T.13.0] -0.3763 0.500 -0.753 0.452 -1.356 0.604\n",
"C(centre)[T.14.0] 0.1482 0.418 0.354 0.723 -0.671 0.968\n",
"C(centre)[T.15.0] 0.0146 0.440 0.033 0.974 -0.847 0.876\n",
"C(centre)[T.16.0] 3.4126 1.042 3.275 0.001 1.370 5.455\n",
"C(centre)[T.17.0] -0.2516 0.651 -0.386 0.699 -1.528 1.025\n",
"C(centre)[T.18.0] 2.5925 1.066 2.432 0.015 0.503 4.682\n",
"C(centre)[T.19.0] -0.2381 0.569 -0.418 0.676 -1.354 0.878\n",
"C(centre)[T.20.0] 0.0497 0.364 0.136 0.891 -0.664 0.764\n",
"C(centre)[T.21.0] 1.6392 1.108 1.480 0.139 -0.532 3.810\n",
"ever_smoked 0.1785 0.277 0.645 0.519 -0.364 0.721\n",
"peto_exposed -0.0675 0.297 -0.227 0.820 -0.649 0.514\n",
"ever_smoked:peto_exposed 0.2676 0.354 0.756 0.450 -0.427 0.962\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.46 1.61 0.86\n",
"C(centre)[T.3.0] 0.13 0.74 0.31\n",
"C(centre)[T.4.0] 0.51 1.71 0.93\n",
"C(centre)[T.5.0] 0.65 2.48 1.27\n",
"C(centre)[T.6.0] 0.51 1.96 1.00\n",
"C(centre)[T.7.0] 0.37 2.02 0.87\n",
"C(centre)[T.8.0] 0.50 1.82 0.96\n",
"C(centre)[T.9.0] 0.08 0.61 0.22\n",
"C(centre)[T.10.0] 0.14 0.55 0.28\n",
"C(centre)[T.11.0] 0.35 2.38 0.91\n",
"C(centre)[T.12.0] 0.26 6.15 1.27\n",
"C(centre)[T.13.0] 0.26 1.83 0.69\n",
"C(centre)[T.14.0] 0.51 2.63 1.16\n",
"C(centre)[T.15.0] 0.43 2.40 1.01\n",
"C(centre)[T.16.0] 3.94 233.96 30.34\n",
"C(centre)[T.17.0] 0.22 2.79 0.78\n",
"C(centre)[T.18.0] 1.65 107.98 13.36\n",
"C(centre)[T.19.0] 0.26 2.41 0.79\n",
"C(centre)[T.20.0] 0.51 2.15 1.05\n",
"C(centre)[T.21.0] 0.59 45.15 5.15\n",
"ever_smoked 0.70 2.06 1.20\n",
"peto_exposed 0.52 1.67 0.93\n",
"ever_smoked:peto_exposed 0.65 2.62 1.31\n"
]
}
],
"source": [
"model = smf.logit('case ~ C(centre) + ever_smoked*peto_exposed', data=df) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Maximum number of iterations has been exceeded.\n",
" Current function value: 0.552271\n",
" Iterations: 35\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 470\n",
"Model: Logit Df Residuals: 447\n",
"Method: MLE Df Model: 22\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.1226\n",
"Time: 18:50:07 Log-Likelihood: -259.57\n",
"converged: False LL-Null: -295.82\n",
"Covariance Type: nonrobust LLR p-value: 2.649e-07\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -0.5072 0.450 -1.128 0.259 -1.388 0.374\n",
"C(centre)[T.3.0] -3.0007 1.071 -2.801 0.005 -5.100 -0.901\n",
"C(centre)[T.4.0] -0.4225 0.429 -0.985 0.325 -1.263 0.418\n",
"C(centre)[T.5.0] -0.3755 0.469 -0.800 0.424 -1.296 0.545\n",
"C(centre)[T.6.0] -0.8764 0.482 -1.816 0.069 -1.822 0.069\n",
"C(centre)[T.7.0] -0.5951 0.585 -1.018 0.309 -1.741 0.551\n",
"C(centre)[T.8.0] -0.3221 0.444 -0.725 0.468 -1.193 0.548\n",
"C(centre)[T.9.0] -1.5425 0.700 -2.205 0.027 -2.914 -0.171\n",
"C(centre)[T.10.0] -1.5149 0.464 -3.266 0.001 -2.424 -0.606\n",
"C(centre)[T.11.0] -0.2154 0.653 -0.330 0.741 -1.495 1.064\n",
"C(centre)[T.12.0] 0.1198 1.051 0.114 0.909 -1.941 2.180\n",
"C(centre)[T.13.0] -0.3851 0.640 -0.602 0.547 -1.639 0.869\n",
"C(centre)[T.14.0] -0.0364 0.593 -0.061 0.951 -1.199 1.126\n",
"C(centre)[T.15.0] -0.6553 0.675 -0.971 0.331 -1.978 0.667\n",
"C(centre)[T.16.0] 2.9878 1.078 2.771 0.006 0.875 5.101\n",
"C(centre)[T.17.0] -0.9279 0.875 -1.061 0.289 -2.642 0.786\n",
"C(centre)[T.18.0] 25.6231 1.93e+05 0.000 1.000 -3.79e+05 3.79e+05\n",
"C(centre)[T.19.0] -1.7448 1.114 -1.566 0.117 -3.929 0.439\n",
"C(centre)[T.20.0] -0.5751 0.514 -1.118 0.264 -1.584 0.433\n",
"C(centre)[T.21.0] 0.8732 1.266 0.690 0.490 -1.607 3.354\n",
"ever_smoked 0.4247 0.403 1.053 0.292 -0.366 1.215\n",
"peto_exposed 0.4169 0.417 1.000 0.317 -0.400 1.234\n",
"ever_smoked:peto_exposed -0.5584 0.502 -1.113 0.266 -1.542 0.425\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.25 1.45 0.60\n",
"C(centre)[T.3.0] 0.01 0.41 0.05\n",
"C(centre)[T.4.0] 0.28 1.52 0.66\n",
"C(centre)[T.5.0] 0.27 1.72 0.69\n",
"C(centre)[T.6.0] 0.16 1.07 0.42\n",
"C(centre)[T.7.0] 0.18 1.74 0.55\n",
"C(centre)[T.8.0] 0.30 1.73 0.72\n",
"C(centre)[T.9.0] 0.05 0.84 0.21\n",
"C(centre)[T.10.0] 0.09 0.55 0.22\n",
"C(centre)[T.11.0] 0.22 2.90 0.81\n",
"C(centre)[T.12.0] 0.14 8.85 1.13\n",
"C(centre)[T.13.0] 0.19 2.38 0.68\n",
"C(centre)[T.14.0] 0.30 3.08 0.96\n",
"C(centre)[T.15.0] 0.14 1.95 0.52\n",
"C(centre)[T.16.0] 2.40 164.15 19.84\n",
"C(centre)[T.17.0] 0.07 2.19 0.40\n",
"C(centre)[T.18.0] 0.00 inf 134,265,754,158.55\n",
"C(centre)[T.19.0] 0.02 1.55 0.17\n",
"C(centre)[T.20.0] 0.21 1.54 0.56\n",
"C(centre)[T.21.0] 0.20 28.61 2.39\n",
"ever_smoked 0.69 3.37 1.53\n",
"peto_exposed 0.67 3.43 1.52\n",
"ever_smoked:peto_exposed 0.21 1.53 0.57\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
" \"Check mle_retvals\", ConvergenceWarning)\n",
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: overflow encountered in exp\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
}
],
"source": [
"model = smf.logit('case ~ C(centre) + ever_smoked*peto_exposed', data=df[df['genotype'] == 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'warnflag': 1, 'converged': False, 'fopt': 0.5522708783537755, 'iterations': 35, 'Hessian': array([[-1.87391996e-01, -2.03838319e-03, -2.51523230e-02,\n",
" -1.75842094e-02, -1.57700251e-02, -8.71592841e-03,\n",
" -2.13369199e-02, -5.41636587e-03, -1.80127868e-02,\n",
" -6.51389565e-03, -2.12573330e-03, -6.82256037e-03,\n",
" -8.34569887e-03, -6.04801705e-03, -2.00045465e-03,\n",
" -3.18873754e-03, -5.68561746e-14, -1.85863557e-03,\n",
" -1.29700506e-02, -1.41720838e-03, -1.34309334e-01,\n",
" -1.24716615e-01, -9.23409703e-02],\n",
" [-2.03838319e-03, -2.03838319e-03, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.09343646e-03,\n",
" -1.35444644e-03, -4.69589213e-04],\n",
" [-2.51523230e-02, -0.00000000e+00, -2.51523230e-02,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.50543249e-02,\n",
" -1.50455575e-02, -1.05597900e-02],\n",
" [-1.75842094e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -1.75842094e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.04050687e-02,\n",
" -9.89202132e-03, -5.35536179e-03],\n",
" [-1.57700251e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -1.57700251e-02, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.06672985e-02,\n",
" -9.37671814e-03, -5.97893227e-03],\n",
" [-8.71592841e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -8.71592841e-03,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -7.36965281e-03,\n",
" -5.46619728e-03, -4.51813184e-03],\n",
" [-2.13369199e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -2.13369199e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.68674526e-02,\n",
" -1.83939952e-02, -1.48245412e-02],\n",
" [-5.41636587e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -5.41636587e-03, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -3.82255379e-03,\n",
" -3.81640773e-03, -2.65269556e-03],\n",
" [-1.80127868e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.80127868e-02,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.28528356e-02,\n",
" -1.13479195e-02, -8.38450647e-03],\n",
" [-6.51389565e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -6.51389565e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -4.58990461e-03,\n",
" -4.06900699e-03, -3.54933022e-03],\n",
" [-2.12573330e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -2.12573330e-03, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.59393390e-03,\n",
" -1.06227367e-03, -5.30474271e-04],\n",
" [-6.82256037e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -6.82256037e-03,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -5.88094914e-03,\n",
" -4.87229639e-03, -4.36934009e-03],\n",
" [-8.34569887e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -8.34569887e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -6.29715782e-03,\n",
" -5.76660723e-03, -4.70703381e-03],\n",
" [-6.04801705e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -6.04801705e-03, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -2.64624129e-03,\n",
" -4.50358625e-03, -2.64624129e-03],\n",
" [-2.00045465e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -2.00045465e-03,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.59178058e-03,\n",
" -1.17380115e-03, -1.06838426e-03],\n",
" [-3.18873754e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -3.18873754e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -2.77393858e-03,\n",
" -2.35609273e-03, -1.94129377e-03],\n",
" [-5.68561746e-14, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -5.68561746e-14, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -5.68561746e-14,\n",
" -3.96477177e-14, -3.96477177e-14],\n",
" [-1.85863557e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -1.85863557e-03,\n",
" -0.00000000e+00, -0.00000000e+00, -1.42288288e-03,\n",
" -1.16748592e-03, -9.14962486e-04],\n",
" [-1.29700506e-02, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -1.29700506e-02, -0.00000000e+00, -1.20907483e-02,\n",
" -7.30640273e-03, -6.82928294e-03],\n",
" [-1.41720838e-03, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
" -0.00000000e+00, -1.41720838e-03, -9.59146699e-04,\n",
" -1.41720838e-03, -9.59146699e-04],\n",
" [-1.34309334e-01, -1.09343646e-03, -1.50543249e-02,\n",
" -1.04050687e-02, -1.06672985e-02, -7.36965281e-03,\n",
" -1.68674526e-02, -3.82255379e-03, -1.28528356e-02,\n",
" -4.58990461e-03, -1.59393390e-03, -5.88094914e-03,\n",
" -6.29715782e-03, -2.64624129e-03, -1.59178058e-03,\n",
" -2.77393858e-03, -5.68561746e-14, -1.42288288e-03,\n",
" -1.20907483e-02, -9.59146699e-04, -1.34309334e-01,\n",
" -9.23409703e-02, -9.23409703e-02],\n",
" [-1.24716615e-01, -1.35444644e-03, -1.50455575e-02,\n",
" -9.89202132e-03, -9.37671814e-03, -5.46619728e-03,\n",
" -1.83939952e-02, -3.81640773e-03, -1.13479195e-02,\n",
" -4.06900699e-03, -1.06227367e-03, -4.87229639e-03,\n",
" -5.76660723e-03, -4.50358625e-03, -1.17380115e-03,\n",
" -2.35609273e-03, -3.96477177e-14, -1.16748592e-03,\n",
" -7.30640273e-03, -1.41720838e-03, -9.23409703e-02,\n",
" -1.24716615e-01, -9.23409703e-02],\n",
" [-9.23409703e-02, -4.69589213e-04, -1.05597900e-02,\n",
" -5.35536179e-03, -5.97893227e-03, -4.51813184e-03,\n",
" -1.48245412e-02, -2.65269556e-03, -8.38450647e-03,\n",
" -3.54933022e-03, -5.30474271e-04, -4.36934009e-03,\n",
" -4.70703381e-03, -2.64624129e-03, -1.06838426e-03,\n",
" -1.94129377e-03, -3.96477177e-14, -9.14962486e-04,\n",
" -6.82928294e-03, -9.59146699e-04, -9.23409703e-02,\n",
" -9.23409703e-02, -9.23409703e-02]]), 'score': array([ 1.70076719e-17, 4.72435330e-19, 0.00000000e+00, 2.71650315e-18,\n",
" -1.88974132e-18, 1.18108832e-18, 2.36217665e-18, 9.44870659e-19,\n",
" 1.41730599e-18, 0.00000000e+00, -2.36217665e-19, 1.41730599e-18,\n",
" 1.88974132e-18, 8.26761827e-19, 4.96057096e-18, 5.90544162e-19,\n",
" 5.68561746e-14, 0.00000000e+00, -9.44870659e-19, 0.00000000e+00,\n",
" 9.44870659e-18, 1.70076719e-17, 8.50383593e-18])}\n"
]
}
],
"source": [
"print(result.mle_retvals)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Maximum number of iterations has been exceeded.\n",
" Current function value: 0.497555\n",
" Iterations: 35\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 338\n",
"Model: Logit Df Residuals: 314\n",
"Method: MLE Df Model: 23\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.1623\n",
"Time: 18:50:07 Log-Likelihood: -168.17\n",
"converged: False LL-Null: -200.76\n",
"Covariance Type: nonrobust LLR p-value: 6.619e-06\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -3.4449 1.476 -2.335 0.020 -6.337 -0.553\n",
"C(centre)[T.3.0] 0.0209 0.677 0.031 0.975 -1.307 1.349\n",
"C(centre)[T.4.0] 0.3200 0.501 0.639 0.523 -0.661 1.301\n",
"C(centre)[T.5.0] 1.2380 0.605 2.045 0.041 0.051 2.425\n",
"C(centre)[T.6.0] 1.8960 0.733 2.587 0.010 0.459 3.333\n",
"C(centre)[T.7.0] 0.9918 0.812 1.222 0.222 -0.599 2.582\n",
"C(centre)[T.8.0] 0.6448 0.557 1.158 0.247 -0.447 1.736\n",
"C(centre)[T.9.0] -1.2746 0.784 -1.626 0.104 -2.811 0.262\n",
"C(centre)[T.10.0] -0.0284 0.635 -0.045 0.964 -1.273 1.217\n",
"C(centre)[T.11.0] 0.6130 0.854 0.718 0.473 -1.060 2.286\n",
"C(centre)[T.12.0] 0.6041 1.343 0.450 0.653 -2.027 3.236\n",
"C(centre)[T.13.0] 0.2125 0.857 0.248 0.804 -1.467 1.892\n",
"C(centre)[T.14.0] 0.2247 0.635 0.354 0.723 -1.020 1.469\n",
"C(centre)[T.15.0] 0.7131 0.707 1.009 0.313 -0.672 2.098\n",
"C(centre)[T.16.0] 23.8352 4.3e+04 0.001 1.000 -8.43e+04 8.43e+04\n",
"C(centre)[T.17.0] 20.7693 1.91e+04 0.001 0.999 -3.74e+04 3.75e+04\n",
"C(centre)[T.18.0] 2.5712 1.152 2.232 0.026 0.313 4.829\n",
"C(centre)[T.19.0] 1.3860 1.173 1.182 0.237 -0.912 3.684\n",
"C(centre)[T.20.0] 1.0472 0.638 1.642 0.101 -0.203 2.297\n",
"C(centre)[T.21.0] 21.7733 3.4e+04 0.001 0.999 -6.65e+04 6.66e+04\n",
"age 0.0480 0.017 2.785 0.005 0.014 0.082\n",
"ever_smoked -0.2134 0.496 -0.431 0.667 -1.185 0.758\n",
"peto_exposed -0.7364 0.537 -1.371 0.170 -1.789 0.316\n",
"ever_smoked:peto_exposed 1.5718 0.636 2.472 0.013 0.326 2.818\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.00 0.58 0.03\n",
"C(centre)[T.3.0] 0.27 3.85 1.02\n",
"C(centre)[T.4.0] 0.52 3.67 1.38\n",
"C(centre)[T.5.0] 1.05 11.30 3.45\n",
"C(centre)[T.6.0] 1.58 28.01 6.66\n",
"C(centre)[T.7.0] 0.55 13.23 2.70\n",
"C(centre)[T.8.0] 0.64 5.68 1.91\n",
"C(centre)[T.9.0] 0.06 1.30 0.28\n",
"C(centre)[T.10.0] 0.28 3.38 0.97\n",
"C(centre)[T.11.0] 0.35 9.84 1.85\n",
"C(centre)[T.12.0] 0.13 25.42 1.83\n",
"C(centre)[T.13.0] 0.23 6.63 1.24\n",
"C(centre)[T.14.0] 0.36 4.35 1.25\n",
"C(centre)[T.15.0] 0.51 8.15 2.04\n",
"C(centre)[T.16.0] 0.00 inf 22,463,517,373.90\n",
"C(centre)[T.17.0] 0.00 inf 1,047,068,906.30\n",
"C(centre)[T.18.0] 1.37 125.10 13.08\n",
"C(centre)[T.19.0] 0.40 39.81 4.00\n",
"C(centre)[T.20.0] 0.82 9.95 2.85\n",
"C(centre)[T.21.0] 0.00 inf 2,857,809,871.09\n",
"age 1.01 1.09 1.05\n",
"ever_smoked 0.31 2.13 0.81\n",
"peto_exposed 0.17 1.37 0.48\n",
"ever_smoked:peto_exposed 1.39 16.74 4.82\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
" \"Check mle_retvals\", ConvergenceWarning)\n",
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: overflow encountered in exp\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + C(centre) + ever_smoked*peto_exposed', data=df[df['genotype'] > 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.508455\n",
" Iterations 7\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 338\n",
"Model: Logit Df Residuals: 332\n",
"Method: MLE Df Model: 5\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.1440\n",
"Time: 18:50:07 Log-Likelihood: -171.86\n",
"converged: True LL-Null: -200.76\n",
"Covariance Type: nonrobust LLR p-value: 3.460e-11\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -3.4483 1.326 -2.601 0.009 -6.047 -0.850\n",
"age 0.0450 0.016 2.780 0.005 0.013 0.077\n",
"distfromcentre 0.0422 0.009 4.702 0.000 0.025 0.060\n",
"ever_smoked -0.1933 0.472 -0.409 0.682 -1.118 0.732\n",
"peto_exposed -0.3531 0.505 -0.700 0.484 -1.342 0.636\n",
"ever_smoked:peto_exposed 1.1723 0.598 1.961 0.050 0.001 2.344\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.00 0.43 0.03\n",
"age 1.01 1.08 1.05\n",
"distfromcentre 1.02 1.06 1.04\n",
"ever_smoked 0.33 2.08 0.82\n",
"peto_exposed 0.26 1.89 0.70\n",
"ever_smoked:peto_exposed 1.00 10.42 3.23\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + distfromcentre + ever_smoked*peto_exposed', data=df[df['genotype'] > 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ipfjes_id', 'case', 'dob', 'age', 'yob', 'agegroup', 'ethnicity',\n",
" 'ever_smoked', 'current_smoker', 'packyrs', 'participant_id', 'centre',\n",
" 'gp_coords', 'centre_coords', 'distfromcentre', 'ct', 'bx', 'fhx',\n",
" 'amiodarone', 'flecainade', 'nitrofurantoin', 'azathioprine',\n",
" 'gefitinib', 'ifosfamide', 'melphalan', 'rituximab', 'mrc0', 'mrc1',\n",
" 'mrc2', 'mrc3', 'mrc4', 'pc_sob', 'pc_cough', 'pc_incidental',\n",
" 'pc_incidental_desc', 'pc_other', 'comments', 'mrc_score',\n",
" 'peto_exposed', 'exposed_stone', 'exposed_wood', 'exposed_metal',\n",
" 'exposed_farm', 'exposed_asbestos', 'peto_dose', 'median_ssec',\n",
" 'fibre_ml_exposure', 'lowest_peto_cat', 'peto_shortlist',\n",
" 'coggan_shortlist', 'mean_pmr', 'highest_pmr', 'meso_pmr_dose',\n",
" 'agecat', 'ethcat', 'ctcat', 'bxcat', 'centre_name',\n",
" 'Aberdeen Royal Infirmary',\n",
" 'Aintree University Hospitals NHS Foundation Trust',\n",
" 'Glasgow Royal Infirmary', 'Guys’ and St Thomas’ NHS Foundation Trust',\n",
" 'Heart of England NHS Foundation Trust',\n",
" 'Imperial College Healthcare NHS Trust',\n",
" 'Leeds Teaching Hospitals NHS Trust', 'Morriston Hospital',\n",
" 'Norfolk and Norwich University Hospitals NHS Foundation Trust',\n",
" 'North Bristol NHS Trust', 'Nottingham University Hospitals NHS Trust',\n",
" 'Papworth Hospital NHS Foundation Trust',\n",
" 'Portsmouth Hospitals NHS Trust',\n",
" 'Royal Devon and Exeter NHS Foundation Trust',\n",
" 'Royal Infirmary of Edinburgh',\n",
" 'Southampton University Hospitals NHS Trust',\n",
" 'Taunton and Somerset NHS Foundation Trust',\n",
" 'The Newcastle Upon Tyne Hospitals NHS Foundation Trust',\n",
" 'University Hospital of South Manchester',\n",
" 'University Hospitals Birmingham NHS Foundation Trust',\n",
" 'Worcestershire Acute Hospitals NHS Trust', 'Arab',\n",
" 'Asian / Asian British', 'Black / African/ Caribbean/ Black British',\n",
" 'Mixed / Multiple ethnic groups', 'Other ethnic group', 'White',\n",
" 'definite UIP', 'no CT', 'other', 'possible UIP', 'ever_drug_exposed',\n",
" 'lowest_peto_cat_reordered', 'median_ssec_int', 'genotype',\n",
" 'minor_allele'],\n",
" dtype='object')"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.555380\n",
" Iterations 6\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 338\n",
"Model: Logit Df Residuals: 327\n",
"Method: MLE Df Model: 10\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.06495\n",
"Time: 18:50:08 Log-Likelihood: -187.72\n",
"converged: True LL-Null: -200.76\n",
"Covariance Type: nonrobust LLR p-value: 0.003637\n",
"=================================================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-----------------------------------------------------------------------------------------------------------------\n",
"Intercept -2.5496 1.348 -1.891 0.059 -5.193 0.093\n",
"C(lowest_peto_cat_reordered)[T.2] 0.2687 0.750 0.358 0.720 -1.202 1.739\n",
"C(lowest_peto_cat_reordered)[T.3] -0.5650 0.674 -0.839 0.402 -1.885 0.755\n",
"C(lowest_peto_cat_reordered)[T.4] 0.0064 0.767 0.008 0.993 -1.497 1.510\n",
"C(lowest_peto_cat_reordered)[T.5] -1.3882 0.946 -1.467 0.142 -3.243 0.467\n",
"age 0.0445 0.016 2.810 0.005 0.013 0.076\n",
"ever_smoked -0.1762 0.656 -0.269 0.788 -1.461 1.109\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.2] -0.4357 0.892 -0.489 0.625 -2.183 1.312\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.3] 1.3211 0.832 1.588 0.112 -0.310 2.952\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.4] 1.1288 0.915 1.234 0.217 -0.665 2.922\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.5] 1.6952 1.082 1.567 0.117 -0.425 3.815\n",
"=================================================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 1.10 0.08\n",
"C(lowest_peto_cat_reordered)[T.2] 0.30 5.69 1.31\n",
"C(lowest_peto_cat_reordered)[T.3] 0.15 2.13 0.57\n",
"C(lowest_peto_cat_reordered)[T.4] 0.22 4.53 1.01\n",
"C(lowest_peto_cat_reordered)[T.5] 0.04 1.59 0.25\n",
"age 1.01 1.08 1.05\n",
"ever_smoked 0.23 3.03 0.84\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.2] 0.11 3.71 0.65\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.3] 0.73 19.14 3.75\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.4] 0.51 18.58 3.09\n",
"ever_smoked:C(lowest_peto_cat_reordered)[T.5] 0.65 45.39 5.45\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*C(lowest_peto_cat_reordered)', data=df[df['genotype'] > 0]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['ipfjes_id', 'case', 'dob', 'age', 'yob', 'agegroup', 'ethnicity',\n",
" 'ever_smoked', 'current_smoker', 'packyrs', 'participant_id', 'centre',\n",
" 'gp_coords', 'centre_coords', 'distfromcentre', 'ct', 'bx', 'fhx',\n",
" 'amiodarone', 'flecainade', 'nitrofurantoin', 'azathioprine',\n",
" 'gefitinib', 'ifosfamide', 'melphalan', 'rituximab', 'mrc0', 'mrc1',\n",
" 'mrc2', 'mrc3', 'mrc4', 'pc_sob', 'pc_cough', 'pc_incidental',\n",
" 'pc_incidental_desc', 'pc_other', 'comments', 'mrc_score',\n",
" 'peto_exposed', 'exposed_stone', 'exposed_wood', 'exposed_metal',\n",
" 'exposed_farm', 'exposed_asbestos', 'peto_dose', 'median_ssec',\n",
" 'fibre_ml_exposure', 'lowest_peto_cat', 'peto_shortlist',\n",
" 'coggan_shortlist', 'mean_pmr', 'highest_pmr', 'meso_pmr_dose',\n",
" 'agecat', 'ethcat', 'ctcat', 'bxcat', 'centre_name',\n",
" 'Aberdeen Royal Infirmary',\n",
" 'Aintree University Hospitals NHS Foundation Trust',\n",
" 'Glasgow Royal Infirmary', 'Guys’ and St Thomas’ NHS Foundation Trust',\n",
" 'Heart of England NHS Foundation Trust',\n",
" 'Imperial College Healthcare NHS Trust',\n",
" 'Leeds Teaching Hospitals NHS Trust', 'Morriston Hospital',\n",
" 'Norfolk and Norwich University Hospitals NHS Foundation Trust',\n",
" 'North Bristol NHS Trust', 'Nottingham University Hospitals NHS Trust',\n",
" 'Papworth Hospital NHS Foundation Trust',\n",
" 'Portsmouth Hospitals NHS Trust',\n",
" 'Royal Devon and Exeter NHS Foundation Trust',\n",
" 'Royal Infirmary of Edinburgh',\n",
" 'Southampton University Hospitals NHS Trust',\n",
" 'Taunton and Somerset NHS Foundation Trust',\n",
" 'The Newcastle Upon Tyne Hospitals NHS Foundation Trust',\n",
" 'University Hospital of South Manchester',\n",
" 'University Hospitals Birmingham NHS Foundation Trust',\n",
" 'Worcestershire Acute Hospitals NHS Trust', 'Arab',\n",
" 'Asian / Asian British', 'Black / African/ Caribbean/ Black British',\n",
" 'Mixed / Multiple ethnic groups', 'Other ethnic group', 'White',\n",
" 'definite UIP', 'no CT', 'other', 'possible UIP', 'ever_drug_exposed',\n",
" 'lowest_peto_cat_reordered', 'median_ssec_int', 'genotype',\n",
" 'minor_allele'],\n",
" dtype='object')"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.572279\n",
" Iterations 5\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 302\n",
"Model: Logit Df Residuals: 297\n",
"Method: MLE Df Model: 4\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.06056\n",
"Time: 18:50:08 Log-Likelihood: -172.83\n",
"converged: True LL-Null: -183.97\n",
"Covariance Type: nonrobust LLR p-value: 0.0001760\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept -2.5108 1.308 -1.919 0.055 -5.075 0.053\n",
"age 0.0465 0.016 2.835 0.005 0.014 0.079\n",
"ever_smoked -0.5233 0.462 -1.132 0.258 -1.430 0.383\n",
"peto_exposed -0.8041 0.499 -1.613 0.107 -1.781 0.173\n",
"ever_smoked:peto_exposed 1.6815 0.590 2.850 0.004 0.525 2.838\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.01 1.05 0.08\n",
"age 1.01 1.08 1.05\n",
"ever_smoked 0.24 1.47 0.59\n",
"peto_exposed 0.17 1.19 0.45\n",
"ever_smoked:peto_exposed 1.69 17.08 5.37\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df[df['genotype'] == 1]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Maximum number of iterations has been exceeded.\n",
" Current function value: 0.371688\n",
" Iterations: 35\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: case No. Observations: 36\n",
"Model: Logit Df Residuals: 31\n",
"Method: MLE Df Model: 4\n",
"Date: Fri, 04 Sep 2020 Pseudo R-squ.: 0.07756\n",
"Time: 18:50:08 Log-Likelihood: -13.381\n",
"converged: False LL-Null: -14.506\n",
"Covariance Type: nonrobust LLR p-value: 0.6898\n",
"============================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"--------------------------------------------------------------------------------------------\n",
"Intercept 4.2880 5.075 0.845 0.398 -5.658 14.234\n",
"age -0.0439 0.060 -0.735 0.462 -0.161 0.073\n",
"ever_smoked 0.7230 1.684 0.429 0.668 -2.578 4.024\n",
"peto_exposed 22.6988 7.6e+04 0.000 1.000 -1.49e+05 1.49e+05\n",
"ever_smoked:peto_exposed -22.4422 7.6e+04 -0.000 1.000 -1.49e+05 1.49e+05\n",
"============================================================================================\n",
"Odds Ratios\n",
"======================\n",
" 2.5% 97.5% OR\n",
"Intercept 0.00 1,519,826.23 72.82\n",
"age 0.85 1.08 0.96\n",
"ever_smoked 0.08 55.92 2.06\n",
"peto_exposed 0.00 inf 7,210,463,004.56\n",
"ever_smoked:peto_exposed 0.00 inf 0.00\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
" \"Check mle_retvals\", ConvergenceWarning)\n",
"/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: overflow encountered in exp\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
}
],
"source": [
"model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df[df['genotype'] == 2]) \n",
"result = model.fit()\n",
"print(result.summary())\n",
"print (\"Odds Ratios\")\n",
"print (\"======================\")\n",
"params = result.params\n",
"conf = result.conf_int()\n",
"conf['OR'] = params\n",
"conf.columns = ['2.5%', '97.5%', 'OR']\n",
"print(np.exp(conf))"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>case</th>\n",
" <th>age</th>\n",
" <th>ever_smoked</th>\n",
" <th>peto_exposed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>case</th>\n",
" <td>1.00</td>\n",
" <td>-0.15</td>\n",
" <td>0.04</td>\n",
" <td>0.14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>age</th>\n",
" <td>-0.15</td>\n",
" <td>1.00</td>\n",
" <td>-0.18</td>\n",
" <td>-0.19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ever_smoked</th>\n",
" <td>0.04</td>\n",
" <td>-0.18</td>\n",
" <td>1.00</td>\n",
" <td>0.26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>peto_exposed</th>\n",
" <td>0.14</td>\n",
" <td>-0.19</td>\n",
" <td>0.26</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" case age ever_smoked peto_exposed\n",
"case 1.00 -0.15 0.04 0.14\n",
"age -0.15 1.00 -0.18 -0.19\n",
"ever_smoked 0.04 -0.18 1.00 0.26\n",
"peto_exposed 0.14 -0.19 0.26 1.00"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['genotype'] == 2][['case', 'age', 'ever_smoked', 'peto_exposed']].corr()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment