drcjar/ipfjes_paper_analysis.ipynb

## ipfjes_paper_analysis.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import statsmodels.formula.api as smf\n",
    "%matplotlib inline\n",
    "from scipy.stats import scoreatpercentile\n",
    "import math\n",
    "import numpy as np\n",
    "\n",
    "# Render floats with thousands separators and two decimal places.\n",
    "pd.set_option('display.float_format', '{:,.2f}'.format)\n",
    "\n",
    "import scipy.stats as stats\n",
    "\n",
    "# NOTE(review): presumably a shim for a dependency that still calls\n",
    "# scipy.stats.chisqprob -- confirm whether it is still needed.\n",
    "stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n",
    "\n",
    "# jobs_dataframe.csv\n",
    "# job_tasks_dataframe.csv\n",
    "# flat_dataframe.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('flat_dataframe.csv')\n",
    "\n",
    "# Recode lowest_peto_cat so that the highest number is the highest exposed.\n",
    "# The 2.1 / 2.2 / 2.3 sub-categories all collapse into level 4.\n",
    "reorderjobcat = {1.: 5, 2.1: 4, 2.2: 4, 2.3: 4, 3.: 3, 4.: 2, 5.: 1}\n",
    "df['lowest_peto_cat_reordered'] = df.lowest_peto_cat.map(reorderjobcat)\n",
    "\n",
    "# Series.map yields NaN for any value that is not a key of the dict, so fail\n",
    "# loudly here instead of carrying NaN into the later tables and models.\n",
    "unmapped = df.loc[df.lowest_peto_cat_reordered.isna(), 'lowest_peto_cat']\n",
    "assert unmapped.empty, 'unmapped lowest_peto_cat values: {}'.format(unmapped.unique())\n",
    "\n",
    "df['median_ssec_int'] = df.median_ssec.astype(int)\n",
    "\n",
    "# Persist the frame, including the two derived columns, for reuse.\n",
    "df.to_csv('flat_data_sans_genotype.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Table one: participant characteristics by case group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>case</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>466.00</td>\n",
       "      <td>74.42</td>\n",
       "      <td>8.53</td>\n",
       "      <td>34.00</td>\n",
       "      <td>69.00</td>\n",
       "      <td>75.00</td>\n",
       "      <td>80.00</td>\n",
       "      <td>95.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>494.00</td>\n",
       "      <td>76.59</td>\n",
       "      <td>7.84</td>\n",
       "      <td>53.00</td>\n",
       "      <td>72.00</td>\n",
       "      <td>77.00</td>\n",
       "      <td>82.00</td>\n",
       "      <td>95.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      count  mean  std   min   25%   50%   75%   max\n",
       "case                                                \n",
       "0    466.00 74.42 8.53 34.00 69.00 75.00 80.00 95.00\n",
       "1    494.00 76.59 7.84 53.00 72.00 77.00 82.00 95.00"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics of age within each case group.\n",
    "df.groupby('case')['age'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  ethnicity                                \n",
      "0     White                                        449\n",
      "      Asian / Asian British                          8\n",
      "      Black / African/ Caribbean/ Black British      7\n",
      "      Other ethnic group                             2\n",
      "1     White                                        479\n",
      "      Asian / Asian British                         11\n",
      "      Black / African/ Caribbean/ Black British      2\n",
      "      Arab                                           1\n",
      "      Mixed / Multiple ethnic groups                 1\n",
      "Name: ethnicity, dtype: int64\n",
      "case  ethnicity                                \n",
      "0     White                                       0.96\n",
      "      Asian / Asian British                       0.02\n",
      "      Black / African/ Caribbean/ Black British   0.02\n",
      "      Other ethnic group                          0.00\n",
      "1     White                                       0.97\n",
      "      Asian / Asian British                       0.02\n",
      "      Black / African/ Caribbean/ Black British   0.00\n",
      "      Arab                                        0.00\n",
      "      Mixed / Multiple ethnic groups              0.00\n",
      "Name: ethnicity, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Ethnicity per case group: raw counts first, then within-group proportions.\n",
    "ethnicity_by_case = df.groupby('case')['ethnicity']\n",
    "for as_fraction in (False, True):\n",
    "    print(ethnicity_by_case.value_counts(normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  median_ssec_int\n",
      "0     1                   39\n",
      "      2                   61\n",
      "      3                   73\n",
      "      4                   52\n",
      "      5                   99\n",
      "      6                   88\n",
      "      7                   54\n",
      "1     1                   37\n",
      "      2                   59\n",
      "      3                   71\n",
      "      4                   55\n",
      "      5                   93\n",
      "      6                  113\n",
      "      7                   66\n",
      "Name: median_ssec_int, dtype: int64\n",
      "case  median_ssec_int\n",
      "0     1                 0.08\n",
      "      2                 0.13\n",
      "      3                 0.16\n",
      "      4                 0.11\n",
      "      5                 0.21\n",
      "      6                 0.19\n",
      "      7                 0.12\n",
      "1     1                 0.07\n",
      "      2                 0.12\n",
      "      3                 0.14\n",
      "      4                 0.11\n",
      "      5                 0.19\n",
      "      6                 0.23\n",
      "      7                 0.13\n",
      "Name: median_ssec_int, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# median_ssec_int per case group: raw counts first, then within-group proportions.\n",
    "ssec_by_case = df.groupby('case').median_ssec_int\n",
    "for as_fraction in (False, True):\n",
    "    print(ssec_by_case.value_counts(sort=False, normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  current_smoker\n",
      "0     0                 436\n",
      "      1                  30\n",
      "1     0                 484\n",
      "      1                  10\n",
      "Name: current_smoker, dtype: int64\n",
      "case  current_smoker\n",
      "0     0                0.94\n",
      "      1                0.06\n",
      "1     0                0.98\n",
      "      1                0.02\n",
      "Name: current_smoker, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# current_smoker per case group: raw counts first, then within-group proportions.\n",
    "current_smoker_by_case = df.groupby('case').current_smoker\n",
    "for as_fraction in (False, True):\n",
    "    print(current_smoker_by_case.value_counts(sort=False, normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  ever_smoked\n",
      "0     0              139\n",
      "      1              327\n",
      "1     0              121\n",
      "      1              373\n",
      "Name: ever_smoked, dtype: int64\n",
      "case  ever_smoked\n",
      "0     0             0.30\n",
      "      1             0.70\n",
      "1     0             0.24\n",
      "      1             0.76\n",
      "Name: ever_smoked, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# ever_smoked per case group: raw counts first, then within-group proportions.\n",
    "ever_smoked_by_case = df.groupby('case').ever_smoked\n",
    "for as_fraction in (False, True):\n",
    "    print(ever_smoked_by_case.value_counts(sort=False, normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>case</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>301.00</td>\n",
       "      <td>26.16</td>\n",
       "      <td>24.94</td>\n",
       "      <td>1.00</td>\n",
       "      <td>9.00</td>\n",
       "      <td>21.00</td>\n",
       "      <td>36.00</td>\n",
       "      <td>165.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>350.00</td>\n",
       "      <td>27.60</td>\n",
       "      <td>26.03</td>\n",
       "      <td>1.00</td>\n",
       "      <td>10.25</td>\n",
       "      <td>21.00</td>\n",
       "      <td>37.75</td>\n",
       "      <td>220.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      count  mean   std  min   25%   50%   75%    max\n",
       "case                                                 \n",
       "0    301.00 26.16 24.94 1.00  9.00 21.00 36.00 165.00\n",
       "1    350.00 27.60 26.03 1.00 10.25 21.00 37.75 220.00"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pack-years summary per case group, restricted to rows with packyrs > 0.\n",
    "df.loc[df.packyrs > 0].groupby('case').packyrs.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  mrc_score\n",
      "0     0.00         254\n",
      "      1.00          65\n",
      "      2.00          80\n",
      "      3.00          65\n",
      "      4.00           2\n",
      "1     0.00          35\n",
      "      1.00          94\n",
      "      2.00         165\n",
      "      3.00         172\n",
      "      4.00          28\n",
      "Name: mrc_score, dtype: int64\n",
      "case  mrc_score\n",
      "0     0.00        0.55\n",
      "      1.00        0.14\n",
      "      2.00        0.17\n",
      "      3.00        0.14\n",
      "      4.00        0.00\n",
      "1     0.00        0.07\n",
      "      1.00        0.19\n",
      "      2.00        0.33\n",
      "      3.00        0.35\n",
      "      4.00        0.06\n",
      "Name: mrc_score, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# mrc_score per case group: raw counts first, then within-group proportions.\n",
    "mrc_by_case = df.groupby('case').mrc_score\n",
    "for as_fraction in (False, True):\n",
    "    print(mrc_by_case.value_counts(sort=False, normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Table two: exposure measures by case group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  peto_exposed\n",
      "0     0               173\n",
      "      1               293\n",
      "1     0               166\n",
      "      1               328\n",
      "Name: peto_exposed, dtype: int64\n",
      "case  peto_exposed\n",
      "0     0              0.37\n",
      "      1              0.63\n",
      "1     0              0.34\n",
      "      1              0.66\n",
      "Name: peto_exposed, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# peto_exposed per case group: raw counts first, then within-group proportions.\n",
    "peto_exposed_by_case = df.groupby('case')['peto_exposed']\n",
    "for as_fraction in (False, True):\n",
    "    print(peto_exposed_by_case.value_counts(sort=False, normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "case  lowest_peto_cat_reordered\n",
      "0     1                             74\n",
      "      2                             99\n",
      "      3                            115\n",
      "      4                            127\n",
      "      5                             51\n",
      "1     1                             72\n",
      "      2                             94\n",
      "      3                            124\n",
      "      4                            140\n",
      "      5                             64\n",
      "Name: lowest_peto_cat_reordered, dtype: int64\n",
      "case  lowest_peto_cat_reordered\n",
      "0     1                           0.16\n",
      "      2                           0.21\n",
      "      3                           0.25\n",
      "      4                           0.27\n",
      "      5                           0.11\n",
      "1     1                           0.15\n",
      "      2                           0.19\n",
      "      3                           0.25\n",
      "      4                           0.28\n",
      "      5                           0.13\n",
      "Name: lowest_peto_cat_reordered, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Reordered exposure category per case group: counts, then within-group proportions.\n",
    "peto_cat_by_case = df.groupby('case')['lowest_peto_cat_reordered']\n",
    "for as_fraction in (False, True):\n",
    "    print(peto_cat_by_case.value_counts(sort=False, normalize=as_fraction))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      0.80\n",
       "1      0.00\n",
       "2      0.00\n",
       "3      0.00\n",
       "4      0.00\n",
       "       ... \n",
       "955    0.00\n",
       "956    0.00\n",
       "957    0.00\n",
       "958   12.42\n",
       "959    0.00\n",
       "Name: fibre_ml_exposure, Length: 960, dtype: float64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Quick look at the raw fibre_ml_exposure column.\n",
    "df['fibre_ml_exposure']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      count     mean      std  min  25%  50%   75%       max\n",
      "case                                                        \n",
      "0    108.00   617.34 3,078.48 0.00 0.22 4.37 57.36 23,785.71\n",
      "1    122.00 1,117.93 5,641.48 0.00 0.30 5.86 62.98 50,761.90\n",
      "      count     mean       std    min    25%    50%      75%       max\n",
      "case                                                                  \n",
      "0     24.00 2,743.04  6,166.46 105.21 216.02 466.14 1,182.85 23,785.71\n",
      "1     27.00 5,013.71 11,308.10 102.74 187.29 594.64 3,615.46 50,761.90\n",
      "      count     mean       std   min    25%    50%      75%       max\n",
      "case                                                                 \n",
      "0     29.00 2,281.78  5,682.68 52.86 144.00 290.83 1,126.29 23,785.71\n",
      "1     34.00 3,995.43 10,240.56 55.15 121.27 297.05 3,214.13 50,761.90\n",
      "      count     mean      std   min   25%    50%      75%       max\n",
      "case                                                               \n",
      "0     35.00 1,896.92 5,227.91 27.12 66.32 222.00 1,067.43 23,785.71\n",
      "1     40.00 3,401.64 9,528.10 28.81 68.42 187.29 1,660.67 50,761.90\n"
     ]
    }
   ],
   "source": [
    "# fibre_ml_exposure summary per case group among rows above each cut-off.\n",
    "# The first cut-off is strict (> 0); the others are inclusive (>= 100, 50, 25).\n",
    "fibre_exposure = df['fibre_ml_exposure']\n",
    "for above_cutoff in (fibre_exposure > 0, fibre_exposure >= 100, fibre_exposure >= 50, fibre_exposure >= 25):\n",
    "    print(df[above_cutoff].groupby('case')['fibre_ml_exposure'].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Table three: logistic regression of case on exposure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.692044\n",
      "         Iterations 3\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      958\n",
      "Method:                           MLE   Df Model:                            1\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:               0.0009785\n",
      "Time:                        18:34:48   Log-Likelihood:                -664.36\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                    0.2540\n",
      "================================================================================\n",
      "                   coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------\n",
      "Intercept       -0.0413      0.109     -0.380      0.704      -0.254       0.172\n",
      "peto_exposed     0.1541      0.135      1.141      0.254      -0.111       0.419\n",
      "================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "              2.5%  97.5%   OR\n",
      "Intercept     0.78   1.19 0.96\n",
      "peto_exposed  0.90   1.52 1.17\n"
     ]
    }
   ],
   "source": [
    "# Unadjusted logistic regression of case on peto_exposed.\n",
    "model = smf.logit('case ~ peto_exposed', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Odds ratios: exponentiate the coefficients together with their CI bounds.\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf.columns = ['2.5%', '97.5%']\n",
    "conf['OR'] = params\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.631284\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      936\n",
      "Method:                           MLE   Df Model:                           23\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.08869\n",
      "Time:                        18:34:50   Log-Likelihood:                -606.03\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                 9.668e-15\n",
      "===================================================================================================================================================\n",
      "                                                                                      coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------------------------------------------------------------------\n",
      "Intercept                                                                          -3.2245      0.786     -4.101      0.000      -4.766      -1.684\n",
      "C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust]                -0.2512      0.496     -0.507      0.612      -1.223       0.721\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]                                           0.3597      0.894      0.402      0.688      -1.393       2.112\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust]                        -0.0053      0.516     -0.010      0.992      -1.018       1.007\n",
      "C(centre_name)[T.Heart of England NHS Foundation Trust]                            -0.1430      0.493     -0.290      0.772      -1.110       0.824\n",
      "C(centre_name)[T.Imperial College Healthcare NHS Trust]                            -1.4553      0.506     -2.874      0.004      -2.448      -0.463\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS Trust]                                2.7931      0.849      3.290      0.001       1.129       4.457\n",
      "C(centre_name)[T.Morriston Hospital]                                               -0.0182      0.605     -0.030      0.976      -1.203       1.167\n",
      "C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust]     1.1716      0.925      1.267      0.205      -0.641       2.985\n",
      "C(centre_name)[T.North Bristol NHS Trust]                                          -0.2862      0.515     -0.556      0.578      -1.296       0.723\n",
      "C(centre_name)[T.Nottingham University Hospitals NHS Trust]                        -0.2194      0.500     -0.438      0.661      -1.200       0.761\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundation Trust]                           -0.2108      0.501     -0.421      0.674      -1.192       0.770\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]                                   -0.4812      0.733     -0.656      0.512      -1.918       0.955\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust]                      -0.2196      0.574     -0.383      0.702      -1.345       0.905\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]                                     -0.3008      0.587     -0.512      0.608      -1.451       0.850\n",
      "C(centre_name)[T.Southampton University Hospitals NHS Trust]                       -0.2792      0.485     -0.576      0.565      -1.230       0.671\n",
      "C(centre_name)[T.Taunton and Somerset NHS Foundation Trust]                        -0.1191      0.579     -0.206      0.837      -1.253       1.015\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust]           -0.0031      0.555     -0.006      0.995      -1.091       1.085\n",
      "C(centre_name)[T.University Hospital of South Manchester]                           0.1289      0.501      0.257      0.797      -0.853       1.111\n",
      "C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust]              2.7552      1.127      2.446      0.014       0.547       4.963\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust]                         -0.6483      0.678     -0.956      0.339      -1.977       0.681\n",
      "age                                                                                 0.0416      0.009      4.677      0.000       0.024       0.059\n",
      "ever_smoked                                                                         0.3310      0.158      2.097      0.036       0.022       0.640\n",
      "peto_exposed                                                                        0.0875      0.146      0.599      0.549      -0.199       0.374\n",
      "===================================================================================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                                                    2.5%  97.5%    OR\n",
      "Intercept                                           0.01   0.19  0.04\n",
      "C(centre_name)[T.Aintree University Hospitals N...  0.29   2.06  0.78\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]           0.25   8.27  1.43\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Found...  0.36   2.74  0.99\n",
      "C(centre_name)[T.Heart of England NHS Foundatio...  0.33   2.28  0.87\n",
      "C(centre_name)[T.Imperial College Healthcare NH...  0.09   0.63  0.23\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS T...  3.09  86.22 16.33\n",
      "C(centre_name)[T.Morriston Hospital]                0.30   3.21  0.98\n",
      "C(centre_name)[T.Norfolk and Norwich University...  0.53  19.78  3.23\n",
      "C(centre_name)[T.North Bristol NHS Trust]           0.27   2.06  0.75\n",
      "C(centre_name)[T.Nottingham University Hospital...  0.30   2.14  0.80\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundati...  0.30   2.16  0.81\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]    0.15   2.60  0.62\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Fou...  0.26   2.47  0.80\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]      0.23   2.34  0.74\n",
      "C(centre_name)[T.Southampton University Hospita...  0.29   1.96  0.76\n",
      "C(centre_name)[T.Taunton and Somerset NHS Found...  0.29   2.76  0.89\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospit...  0.34   2.96  1.00\n",
      "C(centre_name)[T.University Hospital of South M...  0.43   3.04  1.14\n",
      "C(centre_name)[T.University Hospitals Birmingha...  1.73 143.04 15.72\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals...  0.14   1.98  0.52\n",
      "age                                                 1.02   1.06  1.04\n",
      "ever_smoked                                         1.02   1.90  1.39\n",
      "peto_exposed                                        0.82   1.45  1.09\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression of case on peto_exposed, adjusted for age, ever_smoked and centre.\n",
    "model = smf.logit('case ~ age + ever_smoked + C(centre_name) + peto_exposed', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Odds ratios: exponentiate the coefficients together with their CI bounds.\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf.columns = ['2.5%', '97.5%']\n",
    "conf['OR'] = params\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.691792\n",
      "         Iterations 4\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      955\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                0.001342\n",
      "Time:                        18:35:11   Log-Likelihood:                -664.12\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                    0.7752\n",
      "=====================================================================================================\n",
      "                                        coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-----------------------------------------------------------------------------------------------------\n",
      "Intercept                            -0.0274      0.166     -0.166      0.869      -0.352       0.297\n",
      "C(lowest_peto_cat_reordered)[T.2]    -0.0244      0.219     -0.111      0.911      -0.454       0.406\n",
      "C(lowest_peto_cat_reordered)[T.3]     0.1027      0.210      0.489      0.625      -0.309       0.515\n",
      "C(lowest_peto_cat_reordered)[T.4]     0.1249      0.206      0.606      0.544      -0.279       0.529\n",
      "C(lowest_peto_cat_reordered)[T.5]     0.2545      0.250      1.017      0.309      -0.236       0.745\n",
      "=====================================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                                   2.5%  97.5%   OR\n",
      "Intercept                          0.70   1.35 0.97\n",
      "C(lowest_peto_cat_reordered)[T.2]  0.63   1.50 0.98\n",
      "C(lowest_peto_cat_reordered)[T.3]  0.73   1.67 1.11\n",
      "C(lowest_peto_cat_reordered)[T.4]  0.76   1.70 1.13\n",
      "C(lowest_peto_cat_reordered)[T.5]  0.79   2.11 1.29\n"
     ]
    }
   ],
   "source": [
    "# Unadjusted logistic regression of case on the reordered exposure category.\n",
    "model = smf.logit('case ~ C(lowest_peto_cat_reordered)', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Odds ratios: exponentiate the coefficients together with their CI bounds.\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf.columns = ['2.5%', '97.5%']\n",
    "conf['OR'] = params\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.630565\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      933\n",
      "Method:                           MLE   Df Model:                           26\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.08973\n",
      "Time:                        18:35:59   Log-Likelihood:                -605.34\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                 6.451e-14\n",
      "===================================================================================================================================================\n",
      "                                                                                      coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------------------------------------------------------------------\n",
      "Intercept                                                                          -3.1901      0.798     -3.996      0.000      -4.755      -1.625\n",
      "C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust]                -0.2911      0.498     -0.584      0.559      -1.267       0.685\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]                                           0.3443      0.894      0.385      0.700      -1.407       2.096\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust]                        -0.0436      0.518     -0.084      0.933      -1.059       0.972\n",
      "C(centre_name)[T.Heart of England NHS Foundation Trust]                            -0.1706      0.495     -0.345      0.730      -1.141       0.800\n",
      "C(centre_name)[T.Imperial College Healthcare NHS Trust]                            -1.5025      0.509     -2.951      0.003      -2.500      -0.505\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS Trust]                                2.7754      0.849      3.268      0.001       1.111       4.440\n",
      "C(centre_name)[T.Morriston Hospital]                                               -0.0212      0.605     -0.035      0.972      -1.207       1.165\n",
      "C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust]     1.1798      0.929      1.270      0.204      -0.641       3.000\n",
      "C(centre_name)[T.North Bristol NHS Trust]                                          -0.3192      0.516     -0.618      0.536      -1.331       0.693\n",
      "C(centre_name)[T.Nottingham University Hospitals NHS Trust]                        -0.2394      0.502     -0.477      0.633      -1.223       0.745\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundation Trust]                           -0.2279      0.502     -0.454      0.650      -1.211       0.755\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]                                   -0.5405      0.735     -0.735      0.462      -1.981       0.900\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust]                      -0.2412      0.576     -0.419      0.675      -1.370       0.887\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]                                     -0.3293      0.588     -0.560      0.576      -1.483       0.824\n",
      "C(centre_name)[T.Southampton University Hospitals NHS Trust]                       -0.2999      0.486     -0.617      0.537      -1.253       0.653\n",
      "C(centre_name)[T.Taunton and Somerset NHS Foundation Trust]                        -0.1447      0.580     -0.250      0.803      -1.281       0.992\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust]           -0.0439      0.557     -0.079      0.937      -1.135       1.047\n",
      "C(centre_name)[T.University Hospital of South Manchester]                           0.0894      0.503      0.178      0.859      -0.897       1.076\n",
      "C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust]              2.7657      1.127      2.453      0.014       0.556       4.975\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust]                         -0.6496      0.680     -0.955      0.340      -1.983       0.683\n",
      "C(lowest_peto_cat_reordered)[T.2]                                                  -0.1935      0.237     -0.816      0.415      -0.658       0.271\n",
      "C(lowest_peto_cat_reordered)[T.3]                                                  -0.0767      0.228     -0.336      0.737      -0.524       0.371\n",
      "C(lowest_peto_cat_reordered)[T.4]                                                   0.0603      0.223      0.270      0.787      -0.377       0.498\n",
      "C(lowest_peto_cat_reordered)[T.5]                                                  -0.1063      0.271     -0.393      0.694      -0.637       0.424\n",
      "age                                                                                 0.0430      0.009      4.758      0.000       0.025       0.061\n",
      "ever_smoked                                                                         0.3333      0.158      2.104      0.035       0.023       0.644\n",
      "===================================================================================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                                                    2.5%  97.5%    OR\n",
      "Intercept                                           0.01   0.20  0.04\n",
      "C(centre_name)[T.Aintree University Hospitals N...  0.28   1.98  0.75\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]           0.24   8.13  1.41\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Found...  0.35   2.64  0.96\n",
      "C(centre_name)[T.Heart of England NHS Foundatio...  0.32   2.23  0.84\n",
      "C(centre_name)[T.Imperial College Healthcare NH...  0.08   0.60  0.22\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS T...  3.04  84.75 16.05\n",
      "C(centre_name)[T.Morriston Hospital]                0.30   3.21  0.98\n",
      "C(centre_name)[T.Norfolk and Norwich University...  0.53  20.09  3.25\n",
      "C(centre_name)[T.North Bristol NHS Trust]           0.26   2.00  0.73\n",
      "C(centre_name)[T.Nottingham University Hospital...  0.29   2.11  0.79\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundati...  0.30   2.13  0.80\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]    0.14   2.46  0.58\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Fou...  0.25   2.43  0.79\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]      0.23   2.28  0.72\n",
      "C(centre_name)[T.Southampton University Hospita...  0.29   1.92  0.74\n",
      "C(centre_name)[T.Taunton and Somerset NHS Found...  0.28   2.70  0.87\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospit...  0.32   2.85  0.96\n",
      "C(centre_name)[T.University Hospital of South M...  0.41   2.93  1.09\n",
      "C(centre_name)[T.University Hospitals Birmingha...  1.74 144.78 15.89\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals...  0.14   1.98  0.52\n",
      "C(lowest_peto_cat_reordered)[T.2]                   0.52   1.31  0.82\n",
      "C(lowest_peto_cat_reordered)[T.3]                   0.59   1.45  0.93\n",
      "C(lowest_peto_cat_reordered)[T.4]                   0.69   1.64  1.06\n",
      "C(lowest_peto_cat_reordered)[T.5]                   0.53   1.53  0.90\n",
      "age                                                 1.03   1.06  1.04\n",
      "ever_smoked                                         1.02   1.90  1.40\n"
     ]
    }
   ],
   "source": [
    "# Exposure-category model adjusted for age, smoking and recruiting centre.\n",
    "model = smf.logit('case ~ age + ever_smoked + C(centre_name) + C(lowest_peto_cat_reordered)', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Exponentiate the coefficients and their confidence bounds to report odds ratios.\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.690922\n",
      "         Iterations 3\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      958\n",
      "Method:                           MLE   Df Model:                            1\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                0.002598\n",
      "Time:                        18:42:04   Log-Likelihood:                -663.29\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                   0.06305\n",
      "===============================================================================\n",
      "                  coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-------------------------------------------------------------------------------\n",
      "Intercept      -0.1387      0.124     -1.115      0.265      -0.382       0.105\n",
      "ever_smoked     0.2703      0.146      1.857      0.063      -0.015       0.556\n",
      "===============================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "             2.5%  97.5%   OR\n",
      "Intercept    0.68   1.11 0.87\n",
      "ever_smoked  0.99   1.74 1.31\n"
     ]
    }
   ],
   "source": [
    "# Unadjusted model: case status regressed on smoking history alone.\n",
    "model = smf.logit('case ~ ever_smoked', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Exponentiate the coefficients and their confidence bounds to report odds ratios.\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.631471\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      937\n",
      "Method:                           MLE   Df Model:                           22\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.08842\n",
      "Time:                        18:42:05   Log-Likelihood:                -606.21\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                 4.746e-15\n",
      "===================================================================================================================================================\n",
      "                                                                                      coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------------------------------------------------------------------\n",
      "Intercept                                                                          -3.1662      0.779     -4.065      0.000      -4.693      -1.640\n",
      "C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust]                -0.2398      0.495     -0.485      0.628      -1.210       0.730\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]                                           0.3385      0.893      0.379      0.705      -1.413       2.090\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust]                        -0.0125      0.516     -0.024      0.981      -1.023       0.998\n",
      "C(centre_name)[T.Heart of England NHS Foundation Trust]                            -0.1352      0.493     -0.274      0.784      -1.101       0.830\n",
      "C(centre_name)[T.Imperial College Healthcare NHS Trust]                            -1.4626      0.506     -2.893      0.004      -2.453      -0.472\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS Trust]                                2.7912      0.848      3.290      0.001       1.128       4.454\n",
      "C(centre_name)[T.Morriston Hospital]                                               -0.0092      0.604     -0.015      0.988      -1.193       1.174\n",
      "C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust]     1.2009      0.923      1.301      0.193      -0.609       3.011\n",
      "C(centre_name)[T.North Bristol NHS Trust]                                          -0.2879      0.514     -0.560      0.576      -1.296       0.720\n",
      "C(centre_name)[T.Nottingham University Hospitals NHS Trust]                        -0.2179      0.500     -0.436      0.663      -1.197       0.761\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundation Trust]                           -0.2182      0.500     -0.436      0.663      -1.198       0.762\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]                                   -0.4710      0.733     -0.643      0.520      -1.907       0.965\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust]                      -0.2145      0.574     -0.374      0.708      -1.339       0.910\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]                                     -0.2962      0.587     -0.505      0.614      -1.446       0.854\n",
      "C(centre_name)[T.Southampton University Hospitals NHS Trust]                       -0.2776      0.484     -0.573      0.566      -1.227       0.672\n",
      "C(centre_name)[T.Taunton and Somerset NHS Foundation Trust]                        -0.1170      0.578     -0.202      0.840      -1.250       1.016\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust]            0.0018      0.554      0.003      0.997      -1.085       1.088\n",
      "C(centre_name)[T.University Hospital of South Manchester]                           0.1253      0.501      0.250      0.802      -0.856       1.106\n",
      "C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust]              2.7560      1.126      2.447      0.014       0.549       4.963\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust]                         -0.6500      0.678     -0.959      0.337      -1.978       0.678\n",
      "age                                                                                 0.0415      0.009      4.668      0.000       0.024       0.059\n",
      "ever_smoked                                                                         0.3403      0.157      2.166      0.030       0.032       0.648\n",
      "===================================================================================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                                                    2.5%  97.5%    OR\n",
      "Intercept                                           0.01   0.19  0.04\n",
      "C(centre_name)[T.Aintree University Hospitals N...  0.30   2.08  0.79\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]           0.24   8.08  1.40\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Found...  0.36   2.71  0.99\n",
      "C(centre_name)[T.Heart of England NHS Foundatio...  0.33   2.29  0.87\n",
      "C(centre_name)[T.Imperial College Healthcare NH...  0.09   0.62  0.23\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS T...  3.09  85.98 16.30\n",
      "C(centre_name)[T.Morriston Hospital]                0.30   3.24  0.99\n",
      "C(centre_name)[T.Norfolk and Norwich University...  0.54  20.30  3.32\n",
      "C(centre_name)[T.North Bristol NHS Trust]           0.27   2.05  0.75\n",
      "C(centre_name)[T.Nottingham University Hospital...  0.30   2.14  0.80\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundati...  0.30   2.14  0.80\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]    0.15   2.62  0.62\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Fou...  0.26   2.48  0.81\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]      0.24   2.35  0.74\n",
      "C(centre_name)[T.Southampton University Hospita...  0.29   1.96  0.76\n",
      "C(centre_name)[T.Taunton and Somerset NHS Found...  0.29   2.76  0.89\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospit...  0.34   2.97  1.00\n",
      "C(centre_name)[T.University Hospital of South M...  0.43   3.02  1.13\n",
      "C(centre_name)[T.University Hospitals Birmingha...  1.73 143.07 15.74\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals...  0.14   1.97  0.52\n",
      "age                                                 1.02   1.06  1.04\n",
      "ever_smoked                                         1.03   1.91  1.41\n"
     ]
    }
   ],
   "source": [
    "# Smoking model adjusted for age and recruiting centre.\n",
    "model = smf.logit('case ~ age + ever_smoked + C(centre_name)', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Exponentiate the coefficients and their confidence bounds to report odds ratios.\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.688784\n",
      "         Iterations 4\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      956\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                0.005684\n",
      "Time:                        18:42:07   Log-Likelihood:                -661.23\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                   0.05604\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                 2.236e-16      0.186    1.2e-15      1.000      -0.364       0.364\n",
      "ever_smoked                 -0.0628      0.229     -0.274      0.784      -0.512       0.386\n",
      "peto_exposed                -0.2513      0.250     -1.004      0.316      -0.742       0.239\n",
      "ever_smoked:peto_exposed     0.5373      0.299      1.799      0.072      -0.048       1.122\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.69   1.44 1.00\n",
      "ever_smoked               0.60   1.47 0.94\n",
      "peto_exposed              0.48   1.27 0.78\n",
      "ever_smoked:peto_exposed  0.95   3.07 1.71\n"
     ]
    }
   ],
   "source": [
    "# Unadjusted interaction model: smoking, exposure, and their product term.\n",
    "model = smf.logit('case ~ ever_smoked*peto_exposed', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Exponentiate the coefficients and their confidence bounds to report odds ratios.\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.629996\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  960\n",
      "Model:                          Logit   Df Residuals:                      935\n",
      "Method:                           MLE   Df Model:                           24\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.09055\n",
      "Time:                        18:42:09   Log-Likelihood:                -604.80\n",
      "converged:                       True   LL-Null:                       -665.01\n",
      "Covariance Type:            nonrobust   LLR p-value:                 8.124e-15\n",
      "===================================================================================================================================================\n",
      "                                                                                      coef    std err          z      P>|z|      [0.025      0.975]\n",
      "---------------------------------------------------------------------------------------------------------------------------------------------------\n",
      "Intercept                                                                          -3.0969      0.789     -3.928      0.000      -4.642      -1.551\n",
      "C(centre_name)[T.Aintree University Hospitals NHS Foundation Trust]                -0.2009      0.496     -0.405      0.686      -1.174       0.772\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]                                           0.4814      0.901      0.534      0.593      -1.284       2.247\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Foundation Trust]                         0.0454      0.517      0.088      0.930      -0.967       1.058\n",
      "C(centre_name)[T.Heart of England NHS Foundation Trust]                            -0.0932      0.494     -0.189      0.850      -1.061       0.874\n",
      "C(centre_name)[T.Imperial College Healthcare NHS Trust]                            -1.4030      0.506     -2.772      0.006      -2.395      -0.411\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS Trust]                                2.8188      0.848      3.322      0.001       1.156       4.482\n",
      "C(centre_name)[T.Morriston Hospital]                                                0.0153      0.604      0.025      0.980      -1.168       1.198\n",
      "C(centre_name)[T.Norfolk and Norwich University Hospitals NHS Foundation Trust]     1.1958      0.924      1.294      0.196      -0.615       3.007\n",
      "C(centre_name)[T.North Bristol NHS Trust]                                          -0.2484      0.515     -0.483      0.629      -1.257       0.760\n",
      "C(centre_name)[T.Nottingham University Hospitals NHS Trust]                        -0.1411      0.502     -0.281      0.779      -1.125       0.843\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundation Trust]                           -0.1551      0.501     -0.310      0.757      -1.137       0.827\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]                                   -0.3924      0.736     -0.533      0.594      -1.834       1.049\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Foundation Trust]                      -0.1423      0.575     -0.247      0.805      -1.270       0.985\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]                                     -0.2344      0.588     -0.399      0.690      -1.387       0.918\n",
      "C(centre_name)[T.Southampton University Hospitals NHS Trust]                       -0.2419      0.484     -0.499      0.617      -1.191       0.707\n",
      "C(centre_name)[T.Taunton and Somerset NHS Foundation Trust]                        -0.0710      0.578     -0.123      0.902      -1.204       1.062\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospitals NHS Foundation Trust]            0.0117      0.554      0.021      0.983      -1.074       1.098\n",
      "C(centre_name)[T.University Hospital of South Manchester]                           0.1869      0.502      0.373      0.709      -0.796       1.170\n",
      "C(centre_name)[T.University Hospitals Birmingham NHS Foundation Trust]              2.8145      1.128      2.495      0.013       0.604       5.025\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals NHS Trust]                         -0.5971      0.678     -0.881      0.378      -1.926       0.732\n",
      "age                                                                                 0.0419      0.009      4.708      0.000       0.024       0.059\n",
      "ever_smoked                                                                         0.0299      0.248      0.121      0.904      -0.456       0.516\n",
      "peto_exposed                                                                       -0.2690      0.270     -0.996      0.319      -0.798       0.260\n",
      "ever_smoked:peto_exposed                                                            0.5055      0.322      1.571      0.116      -0.125       1.136\n",
      "===================================================================================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                                                    2.5%  97.5%    OR\n",
      "Intercept                                           0.01   0.21  0.05\n",
      "C(centre_name)[T.Aintree University Hospitals N...  0.31   2.16  0.82\n",
      "C(centre_name)[T.Glasgow Royal Infirmary]           0.28   9.46  1.62\n",
      "C(centre_name)[T.Guys’ and St Thomas’ NHS Found...  0.38   2.88  1.05\n",
      "C(centre_name)[T.Heart of England NHS Foundatio...  0.35   2.40  0.91\n",
      "C(centre_name)[T.Imperial College Healthcare NH...  0.09   0.66  0.25\n",
      "C(centre_name)[T.Leeds Teaching Hospitals NHS T...  3.18  88.39 16.76\n",
      "C(centre_name)[T.Morriston Hospital]                0.31   3.31  1.02\n",
      "C(centre_name)[T.Norfolk and Norwich University...  0.54  20.22  3.31\n",
      "C(centre_name)[T.North Bristol NHS Trust]           0.28   2.14  0.78\n",
      "C(centre_name)[T.Nottingham University Hospital...  0.32   2.32  0.87\n",
      "C(centre_name)[T.Papworth Hospital NHS Foundati...  0.32   2.29  0.86\n",
      "C(centre_name)[T.Portsmouth Hospitals NHS Trust]    0.16   2.86  0.68\n",
      "C(centre_name)[T.Royal Devon and Exeter NHS Fou...  0.28   2.68  0.87\n",
      "C(centre_name)[T.Royal Infirmary of Edinburgh]      0.25   2.50  0.79\n",
      "C(centre_name)[T.Southampton University Hospita...  0.30   2.03  0.79\n",
      "C(centre_name)[T.Taunton and Somerset NHS Found...  0.30   2.89  0.93\n",
      "C(centre_name)[T.The Newcastle Upon Tyne Hospit...  0.34   3.00  1.01\n",
      "C(centre_name)[T.University Hospital of South M...  0.45   3.22  1.21\n",
      "C(centre_name)[T.University Hospitals Birmingha...  1.83 152.17 16.68\n",
      "C(centre_name)[T.Worcestershire Acute Hospitals...  0.15   2.08  0.55\n",
      "age                                                 1.02   1.06  1.04\n",
      "ever_smoked                                         0.63   1.67  1.03\n",
      "peto_exposed                                        0.45   1.30  0.76\n",
      "ever_smoked:peto_exposed                            0.88   3.11  1.66\n"
     ]
    }
   ],
   "source": [
    "# Smoking-by-exposure interaction model adjusted for age and recruiting centre.\n",
    "model = smf.logit('case ~ age + ever_smoked*peto_exposed + C(centre_name)', data=df)\n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "\n",
    "# Exponentiate the coefficients and their confidence bounds to report odds ratios.\n",
    "print('Odds Ratios')\n",
    "print('======================')\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# table four"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/pandas/core/indexing.py:376: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self.obj[key] = _infer_fill_value(value)\n",
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/pandas/core/indexing.py:494: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self.obj[item] = s\n"
     ]
    }
   ],
   "source": [
    "# Load the genotyping calls and attach them to the participant-level frame `df`.\n",
    "gt = pd.read_csv('genotyping_cleaned.csv')\n",
    "\n",
    "# The raw labels include stray whitespace and a misspelling; collapse them onto\n",
    "# three canonical genotypes. Any label not listed here becomes NaN.\n",
    "transforms = {'Heterozygous':'(G;T)', 'Homozygous Allele 1':'(G;G)', 'Homozygous Allele 2':'(T;T)',\n",
    "       'Homozygous Allele 1 ':'(G;G)', 'Homoxygous Allele 1':'(G;G)',\n",
    "       ' Homozygous Allele 2':'(T;T)', ' Homozygous Allele 1':'(G;G)'}\n",
    "gt['genotype'] = gt['genotype'].map(transforms)\n",
    "\n",
    "transforms = {\"(G;G)\":0, \"(G;T)\":1, \"(T;T)\":2}\n",
    "\n",
    "gt['genotype'] = gt['genotype'].map(transforms) # change genotype to 0,1,2 (assumes additive model)\n",
    "\n",
    "# add genotype: participant_id -> genotype code, built without mutating a slice of gt\n",
    "genotype_lookup = gt.set_index('participant_id')['genotype'].to_dict()\n",
    "\n",
    "df['genotype'] = df.participant_id.map(genotype_lookup)\n",
    "\n",
    "# restrict analysis to participants with exposure data and genotype.\n",
    "# .copy() makes the subset an independent frame, so adding a column below\n",
    "# no longer triggers SettingWithCopyWarning.\n",
    "df = df[df['genotype'].notnull()].copy()\n",
    "\n",
    "# 1 if the participant has a non-zero genotype code, else 0\n",
    "df['minor_allele'] = (df['genotype'] > 0).astype(int)\n",
    "\n",
    "df.to_csv('flat_data_genotype_subset.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.688106\n",
      "         Iterations 4\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  818\n",
      "Model:                          Logit   Df Residuals:                      814\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                0.006433\n",
      "Time:                        18:50:00   Log-Likelihood:                -562.87\n",
      "converged:                       True   LL-Null:                       -566.52\n",
      "Covariance Type:            nonrobust   LLR p-value:                   0.06325\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -0.2091      0.205     -1.019      0.308      -0.611       0.193\n",
      "ever_smoked                  0.0483      0.252      0.192      0.848      -0.446       0.543\n",
      "peto_exposed                -0.1773      0.272     -0.653      0.514      -0.710       0.355\n",
      "ever_smoked:peto_exposed     0.4471      0.324      1.378      0.168      -0.189       1.083\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.54   1.21 0.81\n",
      "ever_smoked               0.64   1.72 1.05\n",
      "peto_exposed              0.49   1.43 0.84\n",
      "ever_smoked:peto_exposed  0.83   2.95 1.56\n"
     ]
    }
   ],
   "source": [
    "# Logit of case on ever_smoked, peto_exposed and their interaction.\n",
    "model = smf.logit('case ~ ever_smoked*peto_exposed', data=df)\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.681665\n",
      "         Iterations 4\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  818\n",
      "Model:                          Logit   Df Residuals:                      813\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.01573\n",
      "Time:                        18:50:01   Log-Likelihood:                -557.60\n",
      "converged:                       True   LL-Null:                       -566.52\n",
      "Covariance Type:            nonrobust   LLR p-value:                  0.001335\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -2.3285      0.694     -3.356      0.001      -3.689      -0.969\n",
      "age                          0.0279      0.009      3.206      0.001       0.011       0.045\n",
      "ever_smoked                  0.0716      0.254      0.281      0.778      -0.427       0.570\n",
      "peto_exposed                -0.1958      0.274     -0.715      0.474      -0.732       0.341\n",
      "ever_smoked:peto_exposed     0.4725      0.327      1.446      0.148      -0.168       1.113\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.03   0.38 0.10\n",
      "age                       1.01   1.05 1.03\n",
      "ever_smoked               0.65   1.77 1.07\n",
      "peto_exposed              0.48   1.41 0.82\n",
      "ever_smoked:peto_exposed  0.85   3.04 1.60\n"
     ]
    }
   ],
   "source": [
    "# Same interaction model as above, additionally adjusted for age.\n",
    "model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df)\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.608986\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  818\n",
      "Model:                          Logit   Df Residuals:                      814\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                  0.1207\n",
      "Time:                        18:50:02   Log-Likelihood:                -498.15\n",
      "converged:                       True   LL-Null:                       -566.52\n",
      "Covariance Type:            nonrobust   LLR p-value:                 1.918e-29\n",
      "=========================================================================================\n",
      "                            coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-----------------------------------------------------------------------------------------\n",
      "Intercept                -0.7736      0.166     -4.663      0.000      -1.099      -0.448\n",
      "peto_exposed              0.0394      0.204      0.193      0.847      -0.361       0.440\n",
      "genotype                  1.3115      0.239      5.480      0.000       0.842       1.781\n",
      "peto_exposed:genotype     0.3604      0.302      1.194      0.233      -0.231       0.952\n",
      "=========================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                       2.5%  97.5%   OR\n",
      "Intercept              0.33   0.64 0.46\n",
      "peto_exposed           0.70   1.55 1.04\n",
      "genotype               2.32   5.93 3.71\n",
      "peto_exposed:genotype  0.79   2.59 1.43\n"
     ]
    }
   ],
   "source": [
    "# Logit of case on peto_exposed, genotype (coded 0/1/2) and their interaction.\n",
    "model = smf.logit('case ~ peto_exposed*genotype', data=df)\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.601830\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  818\n",
      "Model:                          Logit   Df Residuals:                      812\n",
      "Method:                           MLE   Df Model:                            5\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                  0.1310\n",
      "Time:                        18:50:02   Log-Likelihood:                -492.30\n",
      "converged:                       True   LL-Null:                       -566.52\n",
      "Covariance Type:            nonrobust   LLR p-value:                 2.872e-30\n",
      "=========================================================================================\n",
      "                            coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-----------------------------------------------------------------------------------------\n",
      "Intercept                -3.1222      0.753     -4.145      0.000      -4.598      -1.646\n",
      "age                       0.0285      0.009      3.014      0.003       0.010       0.047\n",
      "ever_smoked               0.3028      0.173      1.749      0.080      -0.037       0.642\n",
      "peto_exposed              0.0108      0.206      0.053      0.958      -0.393       0.415\n",
      "genotype                  1.3058      0.242      5.399      0.000       0.832       1.780\n",
      "peto_exposed:genotype     0.3649      0.305      1.198      0.231      -0.232       0.962\n",
      "=========================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                       2.5%  97.5%   OR\n",
      "Intercept              0.01   0.19 0.04\n",
      "age                    1.01   1.05 1.03\n",
      "ever_smoked            0.96   1.90 1.35\n",
      "peto_exposed           0.67   1.51 1.01\n",
      "genotype               2.30   5.93 3.69\n",
      "peto_exposed:genotype  0.79   2.62 1.44\n"
     ]
    }
   ],
   "source": [
    "# Exposure-by-genotype interaction model, adjusted for age and ever_smoked.\n",
    "model = smf.logit('case ~ age + ever_smoked + peto_exposed*genotype', data=df)\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.623366\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  479\n",
      "Model:                          Logit   Df Residuals:                      475\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                0.002356\n",
      "Time:                        18:50:03   Log-Likelihood:                -298.59\n",
      "converged:                       True   LL-Null:                       -299.30\n",
      "Covariance Type:            nonrobust   LLR p-value:                    0.7031\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -1.0531      0.300     -3.512      0.000      -1.641      -0.465\n",
      "ever_smoked                  0.3882      0.363      1.068      0.285      -0.324       1.100\n",
      "peto_exposed                 0.2116      0.384      0.552      0.581      -0.540       0.963\n",
      "ever_smoked:peto_exposed    -0.2657      0.457     -0.582      0.561      -1.161       0.630\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.19   0.63 0.35\n",
      "ever_smoked               0.72   3.00 1.47\n",
      "peto_exposed              0.58   2.62 1.24\n",
      "ever_smoked:peto_exposed  0.31   1.88 0.77\n"
     ]
    }
   ],
   "source": [
    "# Smoking-by-exposure interaction model, fitted only on rows with genotype == 0.\n",
    "model = smf.logit('case ~ ever_smoked*peto_exposed', data=df.loc[df['genotype'].eq(0)])\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.620634\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  479\n",
      "Model:                          Logit   Df Residuals:                      474\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                0.006730\n",
      "Time:                        18:50:03   Log-Likelihood:                -297.28\n",
      "converged:                       True   LL-Null:                       -299.30\n",
      "Covariance Type:            nonrobust   LLR p-value:                    0.4022\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -2.4977      0.954     -2.619      0.009      -4.367      -0.628\n",
      "age                          0.0193      0.012      1.604      0.109      -0.004       0.043\n",
      "ever_smoked                  0.3719      0.364      1.021      0.307      -0.342       1.086\n",
      "peto_exposed                 0.1741      0.385      0.452      0.651      -0.581       0.929\n",
      "ever_smoked:peto_exposed    -0.2138      0.459     -0.466      0.641      -1.114       0.686\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.01   0.53 0.08\n",
      "age                       1.00   1.04 1.02\n",
      "ever_smoked               0.71   2.96 1.45\n",
      "peto_exposed              0.56   2.53 1.19\n",
      "ever_smoked:peto_exposed  0.33   1.99 0.81\n"
     ]
    }
   ],
   "source": [
    "# Age-adjusted smoking-by-exposure model, fitted only on rows with genotype == 0.\n",
    "model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df.loc[df['genotype'].eq(0)])\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.576083\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  339\n",
      "Model:                          Logit   Df Residuals:                      335\n",
      "Method:                           MLE   Df Model:                            3\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.03332\n",
      "Time:                        18:50:04   Log-Likelihood:                -195.29\n",
      "converged:                       True   LL-Null:                       -202.02\n",
      "Covariance Type:            nonrobust   LLR p-value:                  0.003738\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                    1.0296      0.368      2.795      0.005       0.308       1.752\n",
      "ever_smoked                 -0.5516      0.434     -1.272      0.203      -1.401       0.298\n",
      "peto_exposed                -0.6931      0.471     -1.473      0.141      -1.615       0.229\n",
      "ever_smoked:peto_exposed     1.5797      0.556      2.839      0.005       0.489       2.670\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 1.36   5.76 2.80\n",
      "ever_smoked               0.25   1.35 0.58\n",
      "peto_exposed              0.20   1.26 0.50\n",
      "ever_smoked:peto_exposed  1.63  14.44 4.85\n"
     ]
    }
   ],
   "source": [
    "# Smoking-by-exposure interaction model, fitted only on rows with genotype > 0.\n",
    "model = smf.logit('case ~ ever_smoked*peto_exposed', data=df.loc[df['genotype'].gt(0)])\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.565499\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  339\n",
      "Model:                          Logit   Df Residuals:                      334\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.05108\n",
      "Time:                        18:50:04   Log-Likelihood:                -191.70\n",
      "converged:                       True   LL-Null:                       -202.02\n",
      "Covariance Type:            nonrobust   LLR p-value:                 0.0003737\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -2.1013      1.239     -1.696      0.090      -4.529       0.327\n",
      "age                          0.0405      0.015      2.628      0.009       0.010       0.071\n",
      "ever_smoked                 -0.4274      0.441     -0.969      0.333      -1.292       0.437\n",
      "peto_exposed                -0.6464      0.477     -1.354      0.176      -1.582       0.289\n",
      "ever_smoked:peto_exposed     1.5147      0.564      2.686      0.007       0.409       2.620\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.01   1.39 0.12\n",
      "age                       1.01   1.07 1.04\n",
      "ever_smoked               0.27   1.55 0.65\n",
      "peto_exposed              0.21   1.34 0.52\n",
      "ever_smoked:peto_exposed  1.51  13.73 4.55\n"
     ]
    }
   ],
   "source": [
    "# Age-adjusted smoking-by-exposure model, fitted only on rows with genotype > 0.\n",
    "model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df.loc[df['genotype'].gt(0)])\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ipfjes_id', 'case', 'dob', 'age', 'yob', 'agegroup', 'ethnicity',\n",
       "       'ever_smoked', 'current_smoker', 'packyrs', 'participant_id', 'centre',\n",
       "       'gp_coords', 'centre_coords', 'distfromcentre', 'ct', 'bx', 'fhx',\n",
       "       'amiodarone', 'flecainade', 'nitrofurantoin', 'azathioprine',\n",
       "       'gefitinib', 'ifosfamide', 'melphalan', 'rituximab', 'mrc0', 'mrc1',\n",
       "       'mrc2', 'mrc3', 'mrc4', 'pc_sob', 'pc_cough', 'pc_incidental',\n",
       "       'pc_incidental_desc', 'pc_other', 'comments', 'mrc_score',\n",
       "       'peto_exposed', 'exposed_stone', 'exposed_wood', 'exposed_metal',\n",
       "       'exposed_farm', 'exposed_asbestos', 'peto_dose', 'median_ssec',\n",
       "       'fibre_ml_exposure', 'lowest_peto_cat', 'peto_shortlist',\n",
       "       'coggan_shortlist', 'mean_pmr', 'highest_pmr', 'meso_pmr_dose',\n",
       "       'agecat', 'ethcat', 'ctcat', 'bxcat', 'centre_name',\n",
       "       'Aberdeen Royal Infirmary',\n",
       "       'Aintree University Hospitals NHS Foundation Trust',\n",
       "       'Glasgow Royal Infirmary', 'Guys’ and St Thomas’ NHS Foundation Trust',\n",
       "       'Heart of England NHS Foundation Trust',\n",
       "       'Imperial College Healthcare NHS Trust',\n",
       "       'Leeds Teaching Hospitals NHS Trust', 'Morriston Hospital',\n",
       "       'Norfolk and Norwich University Hospitals NHS Foundation Trust',\n",
       "       'North Bristol NHS Trust', 'Nottingham University Hospitals NHS Trust',\n",
       "       'Papworth Hospital NHS Foundation Trust',\n",
       "       'Portsmouth Hospitals NHS Trust',\n",
       "       'Royal Devon and Exeter NHS Foundation Trust',\n",
       "       'Royal Infirmary of Edinburgh',\n",
       "       'Southampton University Hospitals NHS Trust',\n",
       "       'Taunton and Somerset NHS Foundation Trust',\n",
       "       'The Newcastle Upon Tyne Hospitals NHS Foundation Trust',\n",
       "       'University Hospital of South Manchester',\n",
       "       'University Hospitals Birmingham NHS Foundation Trust',\n",
       "       'Worcestershire Acute Hospitals NHS Trust', 'Arab',\n",
       "       'Asian / Asian British', 'Black / African/ Caribbean/ Black British',\n",
       "       'Mixed / Multiple ethnic groups', 'Other ethnic group', 'White',\n",
       "       'definite UIP', 'no CT', 'other', 'possible UIP', 'ever_drug_exposed',\n",
       "       'lowest_peto_cat_reordered', 'median_ssec_int', 'genotype',\n",
       "       'minor_allele'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the column labels currently present on df.\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write a reduced set of columns from df to cosseta.csv (row index omitted).\n",
    "df[[\n",
    "    'ipfjes_id', 'case', 'minor_allele', 'genotype',\n",
    "    'peto_exposed', 'peto_dose', 'lowest_peto_cat_reordered', 'fibre_ml_exposure',\n",
    "    'ever_smoked', 'packyrs', 'age',\n",
    "    'ethnicity', 'ethcat', 'centre',\n",
    "]].to_csv('cosseta.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# sensitivity analysis: adjusting for centre"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# throw away centre 2 since it contains no genotyped cases\n",
    "df = df.loc[df['centre'].ne(2)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.623254\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  808\n",
      "Model:                          Logit   Df Residuals:                      785\n",
      "Method:                           MLE   Df Model:                           22\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                  0.1005\n",
      "Time:                        18:50:07   Log-Likelihood:                -503.59\n",
      "converged:                       True   LL-Null:                       -559.86\n",
      "Covariance Type:            nonrobust   LLR p-value:                 3.864e-14\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -0.1533      0.320     -0.478      0.632      -0.781       0.475\n",
      "C(centre)[T.3.0]            -1.1856      0.449     -2.642      0.008      -2.065      -0.306\n",
      "C(centre)[T.4.0]            -0.0720      0.310     -0.232      0.816      -0.680       0.536\n",
      "C(centre)[T.5.0]             0.2419      0.340      0.711      0.477      -0.425       0.909\n",
      "C(centre)[T.6.0]             0.0016      0.342      0.005      0.996      -0.668       0.671\n",
      "C(centre)[T.7.0]            -0.1405      0.429     -0.327      0.744      -0.982       0.701\n",
      "C(centre)[T.8.0]            -0.0454      0.329     -0.138      0.890      -0.691       0.600\n",
      "C(centre)[T.9.0]            -1.4948      0.512     -2.922      0.003      -2.498      -0.492\n",
      "C(centre)[T.10.0]           -1.2839      0.350     -3.664      0.000      -1.971      -0.597\n",
      "C(centre)[T.11.0]           -0.0920      0.489     -0.188      0.851      -1.050       0.866\n",
      "C(centre)[T.12.0]            0.2417      0.803      0.301      0.763      -1.332       1.816\n",
      "C(centre)[T.13.0]           -0.3763      0.500     -0.753      0.452      -1.356       0.604\n",
      "C(centre)[T.14.0]            0.1482      0.418      0.354      0.723      -0.671       0.968\n",
      "C(centre)[T.15.0]            0.0146      0.440      0.033      0.974      -0.847       0.876\n",
      "C(centre)[T.16.0]            3.4126      1.042      3.275      0.001       1.370       5.455\n",
      "C(centre)[T.17.0]           -0.2516      0.651     -0.386      0.699      -1.528       1.025\n",
      "C(centre)[T.18.0]            2.5925      1.066      2.432      0.015       0.503       4.682\n",
      "C(centre)[T.19.0]           -0.2381      0.569     -0.418      0.676      -1.354       0.878\n",
      "C(centre)[T.20.0]            0.0497      0.364      0.136      0.891      -0.664       0.764\n",
      "C(centre)[T.21.0]            1.6392      1.108      1.480      0.139      -0.532       3.810\n",
      "ever_smoked                  0.1785      0.277      0.645      0.519      -0.364       0.721\n",
      "peto_exposed                -0.0675      0.297     -0.227      0.820      -0.649       0.514\n",
      "ever_smoked:peto_exposed     0.2676      0.354      0.756      0.450      -0.427       0.962\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%    OR\n",
      "Intercept                 0.46   1.61  0.86\n",
      "C(centre)[T.3.0]          0.13   0.74  0.31\n",
      "C(centre)[T.4.0]          0.51   1.71  0.93\n",
      "C(centre)[T.5.0]          0.65   2.48  1.27\n",
      "C(centre)[T.6.0]          0.51   1.96  1.00\n",
      "C(centre)[T.7.0]          0.37   2.02  0.87\n",
      "C(centre)[T.8.0]          0.50   1.82  0.96\n",
      "C(centre)[T.9.0]          0.08   0.61  0.22\n",
      "C(centre)[T.10.0]         0.14   0.55  0.28\n",
      "C(centre)[T.11.0]         0.35   2.38  0.91\n",
      "C(centre)[T.12.0]         0.26   6.15  1.27\n",
      "C(centre)[T.13.0]         0.26   1.83  0.69\n",
      "C(centre)[T.14.0]         0.51   2.63  1.16\n",
      "C(centre)[T.15.0]         0.43   2.40  1.01\n",
      "C(centre)[T.16.0]         3.94 233.96 30.34\n",
      "C(centre)[T.17.0]         0.22   2.79  0.78\n",
      "C(centre)[T.18.0]         1.65 107.98 13.36\n",
      "C(centre)[T.19.0]         0.26   2.41  0.79\n",
      "C(centre)[T.20.0]         0.51   2.15  1.05\n",
      "C(centre)[T.21.0]         0.59  45.15  5.15\n",
      "ever_smoked               0.70   2.06  1.20\n",
      "peto_exposed              0.52   1.67  0.93\n",
      "ever_smoked:peto_exposed  0.65   2.62  1.31\n"
     ]
    }
   ],
   "source": [
    "# Smoking-by-exposure interaction model with centre entered as a categorical term.\n",
    "model = smf.logit('case ~ C(centre) + ever_smoked*peto_exposed', data=df)\n",
    "result = model.fit()\n",
    "\n",
    "# Coefficients plus their confidence bounds, side by side, for exponentiation below.\n",
    "params = result.params\n",
    "conf = result.conf_int().assign(OR=params)\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "print(result.summary())\n",
    "print(\"Odds Ratios\")\n",
    "print(\"======================\")\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Maximum number of iterations has been exceeded.\n",
      "         Current function value: 0.552271\n",
      "         Iterations: 35\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  470\n",
      "Model:                          Logit   Df Residuals:                      447\n",
      "Method:                           MLE   Df Model:                           22\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                  0.1226\n",
      "Time:                        18:50:07   Log-Likelihood:                -259.57\n",
      "converged:                      False   LL-Null:                       -295.82\n",
      "Covariance Type:            nonrobust   LLR p-value:                 2.649e-07\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -0.5072      0.450     -1.128      0.259      -1.388       0.374\n",
      "C(centre)[T.3.0]            -3.0007      1.071     -2.801      0.005      -5.100      -0.901\n",
      "C(centre)[T.4.0]            -0.4225      0.429     -0.985      0.325      -1.263       0.418\n",
      "C(centre)[T.5.0]            -0.3755      0.469     -0.800      0.424      -1.296       0.545\n",
      "C(centre)[T.6.0]            -0.8764      0.482     -1.816      0.069      -1.822       0.069\n",
      "C(centre)[T.7.0]            -0.5951      0.585     -1.018      0.309      -1.741       0.551\n",
      "C(centre)[T.8.0]            -0.3221      0.444     -0.725      0.468      -1.193       0.548\n",
      "C(centre)[T.9.0]            -1.5425      0.700     -2.205      0.027      -2.914      -0.171\n",
      "C(centre)[T.10.0]           -1.5149      0.464     -3.266      0.001      -2.424      -0.606\n",
      "C(centre)[T.11.0]           -0.2154      0.653     -0.330      0.741      -1.495       1.064\n",
      "C(centre)[T.12.0]            0.1198      1.051      0.114      0.909      -1.941       2.180\n",
      "C(centre)[T.13.0]           -0.3851      0.640     -0.602      0.547      -1.639       0.869\n",
      "C(centre)[T.14.0]           -0.0364      0.593     -0.061      0.951      -1.199       1.126\n",
      "C(centre)[T.15.0]           -0.6553      0.675     -0.971      0.331      -1.978       0.667\n",
      "C(centre)[T.16.0]            2.9878      1.078      2.771      0.006       0.875       5.101\n",
      "C(centre)[T.17.0]           -0.9279      0.875     -1.061      0.289      -2.642       0.786\n",
      "C(centre)[T.18.0]           25.6231   1.93e+05      0.000      1.000   -3.79e+05    3.79e+05\n",
      "C(centre)[T.19.0]           -1.7448      1.114     -1.566      0.117      -3.929       0.439\n",
      "C(centre)[T.20.0]           -0.5751      0.514     -1.118      0.264      -1.584       0.433\n",
      "C(centre)[T.21.0]            0.8732      1.266      0.690      0.490      -1.607       3.354\n",
      "ever_smoked                  0.4247      0.403      1.053      0.292      -0.366       1.215\n",
      "peto_exposed                 0.4169      0.417      1.000      0.317      -0.400       1.234\n",
      "ever_smoked:peto_exposed    -0.5584      0.502     -1.113      0.266      -1.542       0.425\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%                 OR\n",
      "Intercept                 0.25   1.45               0.60\n",
      "C(centre)[T.3.0]          0.01   0.41               0.05\n",
      "C(centre)[T.4.0]          0.28   1.52               0.66\n",
      "C(centre)[T.5.0]          0.27   1.72               0.69\n",
      "C(centre)[T.6.0]          0.16   1.07               0.42\n",
      "C(centre)[T.7.0]          0.18   1.74               0.55\n",
      "C(centre)[T.8.0]          0.30   1.73               0.72\n",
      "C(centre)[T.9.0]          0.05   0.84               0.21\n",
      "C(centre)[T.10.0]         0.09   0.55               0.22\n",
      "C(centre)[T.11.0]         0.22   2.90               0.81\n",
      "C(centre)[T.12.0]         0.14   8.85               1.13\n",
      "C(centre)[T.13.0]         0.19   2.38               0.68\n",
      "C(centre)[T.14.0]         0.30   3.08               0.96\n",
      "C(centre)[T.15.0]         0.14   1.95               0.52\n",
      "C(centre)[T.16.0]         2.40 164.15              19.84\n",
      "C(centre)[T.17.0]         0.07   2.19               0.40\n",
      "C(centre)[T.18.0]         0.00    inf 134,265,754,158.55\n",
      "C(centre)[T.19.0]         0.02   1.55               0.17\n",
      "C(centre)[T.20.0]         0.21   1.54               0.56\n",
      "C(centre)[T.21.0]         0.20  28.61               2.39\n",
      "ever_smoked               0.69   3.37               1.53\n",
      "peto_exposed              0.67   3.43               1.52\n",
      "ever_smoked:peto_exposed  0.21   1.53               0.57\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n",
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: overflow encountered in exp\n",
      "  # Remove the CWD from sys.path while we load stuff.\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ C(centre) + ever_smoked*peto_exposed', data=df[df['genotype'] == 0])              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'warnflag': 1, 'converged': False, 'fopt': 0.5522708783537755, 'iterations': 35, 'Hessian': array([[-1.87391996e-01, -2.03838319e-03, -2.51523230e-02,\n",
      "        -1.75842094e-02, -1.57700251e-02, -8.71592841e-03,\n",
      "        -2.13369199e-02, -5.41636587e-03, -1.80127868e-02,\n",
      "        -6.51389565e-03, -2.12573330e-03, -6.82256037e-03,\n",
      "        -8.34569887e-03, -6.04801705e-03, -2.00045465e-03,\n",
      "        -3.18873754e-03, -5.68561746e-14, -1.85863557e-03,\n",
      "        -1.29700506e-02, -1.41720838e-03, -1.34309334e-01,\n",
      "        -1.24716615e-01, -9.23409703e-02],\n",
      "       [-2.03838319e-03, -2.03838319e-03, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.09343646e-03,\n",
      "        -1.35444644e-03, -4.69589213e-04],\n",
      "       [-2.51523230e-02, -0.00000000e+00, -2.51523230e-02,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.50543249e-02,\n",
      "        -1.50455575e-02, -1.05597900e-02],\n",
      "       [-1.75842094e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -1.75842094e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.04050687e-02,\n",
      "        -9.89202132e-03, -5.35536179e-03],\n",
      "       [-1.57700251e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -1.57700251e-02, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.06672985e-02,\n",
      "        -9.37671814e-03, -5.97893227e-03],\n",
      "       [-8.71592841e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -8.71592841e-03,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -7.36965281e-03,\n",
      "        -5.46619728e-03, -4.51813184e-03],\n",
      "       [-2.13369199e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -2.13369199e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.68674526e-02,\n",
      "        -1.83939952e-02, -1.48245412e-02],\n",
      "       [-5.41636587e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -5.41636587e-03, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -3.82255379e-03,\n",
      "        -3.81640773e-03, -2.65269556e-03],\n",
      "       [-1.80127868e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.80127868e-02,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.28528356e-02,\n",
      "        -1.13479195e-02, -8.38450647e-03],\n",
      "       [-6.51389565e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -6.51389565e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -4.58990461e-03,\n",
      "        -4.06900699e-03, -3.54933022e-03],\n",
      "       [-2.12573330e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -2.12573330e-03, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.59393390e-03,\n",
      "        -1.06227367e-03, -5.30474271e-04],\n",
      "       [-6.82256037e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -6.82256037e-03,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -5.88094914e-03,\n",
      "        -4.87229639e-03, -4.36934009e-03],\n",
      "       [-8.34569887e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -8.34569887e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -6.29715782e-03,\n",
      "        -5.76660723e-03, -4.70703381e-03],\n",
      "       [-6.04801705e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -6.04801705e-03, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -2.64624129e-03,\n",
      "        -4.50358625e-03, -2.64624129e-03],\n",
      "       [-2.00045465e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -2.00045465e-03,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.59178058e-03,\n",
      "        -1.17380115e-03, -1.06838426e-03],\n",
      "       [-3.18873754e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -3.18873754e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -2.77393858e-03,\n",
      "        -2.35609273e-03, -1.94129377e-03],\n",
      "       [-5.68561746e-14, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -5.68561746e-14, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -5.68561746e-14,\n",
      "        -3.96477177e-14, -3.96477177e-14],\n",
      "       [-1.85863557e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.85863557e-03,\n",
      "        -0.00000000e+00, -0.00000000e+00, -1.42288288e-03,\n",
      "        -1.16748592e-03, -9.14962486e-04],\n",
      "       [-1.29700506e-02, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -1.29700506e-02, -0.00000000e+00, -1.20907483e-02,\n",
      "        -7.30640273e-03, -6.82928294e-03],\n",
      "       [-1.41720838e-03, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,\n",
      "        -0.00000000e+00, -1.41720838e-03, -9.59146699e-04,\n",
      "        -1.41720838e-03, -9.59146699e-04],\n",
      "       [-1.34309334e-01, -1.09343646e-03, -1.50543249e-02,\n",
      "        -1.04050687e-02, -1.06672985e-02, -7.36965281e-03,\n",
      "        -1.68674526e-02, -3.82255379e-03, -1.28528356e-02,\n",
      "        -4.58990461e-03, -1.59393390e-03, -5.88094914e-03,\n",
      "        -6.29715782e-03, -2.64624129e-03, -1.59178058e-03,\n",
      "        -2.77393858e-03, -5.68561746e-14, -1.42288288e-03,\n",
      "        -1.20907483e-02, -9.59146699e-04, -1.34309334e-01,\n",
      "        -9.23409703e-02, -9.23409703e-02],\n",
      "       [-1.24716615e-01, -1.35444644e-03, -1.50455575e-02,\n",
      "        -9.89202132e-03, -9.37671814e-03, -5.46619728e-03,\n",
      "        -1.83939952e-02, -3.81640773e-03, -1.13479195e-02,\n",
      "        -4.06900699e-03, -1.06227367e-03, -4.87229639e-03,\n",
      "        -5.76660723e-03, -4.50358625e-03, -1.17380115e-03,\n",
      "        -2.35609273e-03, -3.96477177e-14, -1.16748592e-03,\n",
      "        -7.30640273e-03, -1.41720838e-03, -9.23409703e-02,\n",
      "        -1.24716615e-01, -9.23409703e-02],\n",
      "       [-9.23409703e-02, -4.69589213e-04, -1.05597900e-02,\n",
      "        -5.35536179e-03, -5.97893227e-03, -4.51813184e-03,\n",
      "        -1.48245412e-02, -2.65269556e-03, -8.38450647e-03,\n",
      "        -3.54933022e-03, -5.30474271e-04, -4.36934009e-03,\n",
      "        -4.70703381e-03, -2.64624129e-03, -1.06838426e-03,\n",
      "        -1.94129377e-03, -3.96477177e-14, -9.14962486e-04,\n",
      "        -6.82928294e-03, -9.59146699e-04, -9.23409703e-02,\n",
      "        -9.23409703e-02, -9.23409703e-02]]), 'score': array([ 1.70076719e-17,  4.72435330e-19,  0.00000000e+00,  2.71650315e-18,\n",
      "       -1.88974132e-18,  1.18108832e-18,  2.36217665e-18,  9.44870659e-19,\n",
      "        1.41730599e-18,  0.00000000e+00, -2.36217665e-19,  1.41730599e-18,\n",
      "        1.88974132e-18,  8.26761827e-19,  4.96057096e-18,  5.90544162e-19,\n",
      "        5.68561746e-14,  0.00000000e+00, -9.44870659e-19,  0.00000000e+00,\n",
      "        9.44870659e-18,  1.70076719e-17,  8.50383593e-18])}\n"
     ]
    }
   ],
   "source": [
    "print(result.mle_retvals)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Maximum number of iterations has been exceeded.\n",
      "         Current function value: 0.497555\n",
      "         Iterations: 35\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  338\n",
      "Model:                          Logit   Df Residuals:                      314\n",
      "Method:                           MLE   Df Model:                           23\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                  0.1623\n",
      "Time:                        18:50:07   Log-Likelihood:                -168.17\n",
      "converged:                      False   LL-Null:                       -200.76\n",
      "Covariance Type:            nonrobust   LLR p-value:                 6.619e-06\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -3.4449      1.476     -2.335      0.020      -6.337      -0.553\n",
      "C(centre)[T.3.0]             0.0209      0.677      0.031      0.975      -1.307       1.349\n",
      "C(centre)[T.4.0]             0.3200      0.501      0.639      0.523      -0.661       1.301\n",
      "C(centre)[T.5.0]             1.2380      0.605      2.045      0.041       0.051       2.425\n",
      "C(centre)[T.6.0]             1.8960      0.733      2.587      0.010       0.459       3.333\n",
      "C(centre)[T.7.0]             0.9918      0.812      1.222      0.222      -0.599       2.582\n",
      "C(centre)[T.8.0]             0.6448      0.557      1.158      0.247      -0.447       1.736\n",
      "C(centre)[T.9.0]            -1.2746      0.784     -1.626      0.104      -2.811       0.262\n",
      "C(centre)[T.10.0]           -0.0284      0.635     -0.045      0.964      -1.273       1.217\n",
      "C(centre)[T.11.0]            0.6130      0.854      0.718      0.473      -1.060       2.286\n",
      "C(centre)[T.12.0]            0.6041      1.343      0.450      0.653      -2.027       3.236\n",
      "C(centre)[T.13.0]            0.2125      0.857      0.248      0.804      -1.467       1.892\n",
      "C(centre)[T.14.0]            0.2247      0.635      0.354      0.723      -1.020       1.469\n",
      "C(centre)[T.15.0]            0.7131      0.707      1.009      0.313      -0.672       2.098\n",
      "C(centre)[T.16.0]           23.8352    4.3e+04      0.001      1.000   -8.43e+04    8.43e+04\n",
      "C(centre)[T.17.0]           20.7693   1.91e+04      0.001      0.999   -3.74e+04    3.75e+04\n",
      "C(centre)[T.18.0]            2.5712      1.152      2.232      0.026       0.313       4.829\n",
      "C(centre)[T.19.0]            1.3860      1.173      1.182      0.237      -0.912       3.684\n",
      "C(centre)[T.20.0]            1.0472      0.638      1.642      0.101      -0.203       2.297\n",
      "C(centre)[T.21.0]           21.7733    3.4e+04      0.001      0.999   -6.65e+04    6.66e+04\n",
      "age                          0.0480      0.017      2.785      0.005       0.014       0.082\n",
      "ever_smoked                 -0.2134      0.496     -0.431      0.667      -1.185       0.758\n",
      "peto_exposed                -0.7364      0.537     -1.371      0.170      -1.789       0.316\n",
      "ever_smoked:peto_exposed     1.5718      0.636      2.472      0.013       0.326       2.818\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%                OR\n",
      "Intercept                 0.00   0.58              0.03\n",
      "C(centre)[T.3.0]          0.27   3.85              1.02\n",
      "C(centre)[T.4.0]          0.52   3.67              1.38\n",
      "C(centre)[T.5.0]          1.05  11.30              3.45\n",
      "C(centre)[T.6.0]          1.58  28.01              6.66\n",
      "C(centre)[T.7.0]          0.55  13.23              2.70\n",
      "C(centre)[T.8.0]          0.64   5.68              1.91\n",
      "C(centre)[T.9.0]          0.06   1.30              0.28\n",
      "C(centre)[T.10.0]         0.28   3.38              0.97\n",
      "C(centre)[T.11.0]         0.35   9.84              1.85\n",
      "C(centre)[T.12.0]         0.13  25.42              1.83\n",
      "C(centre)[T.13.0]         0.23   6.63              1.24\n",
      "C(centre)[T.14.0]         0.36   4.35              1.25\n",
      "C(centre)[T.15.0]         0.51   8.15              2.04\n",
      "C(centre)[T.16.0]         0.00    inf 22,463,517,373.90\n",
      "C(centre)[T.17.0]         0.00    inf  1,047,068,906.30\n",
      "C(centre)[T.18.0]         1.37 125.10             13.08\n",
      "C(centre)[T.19.0]         0.40  39.81              4.00\n",
      "C(centre)[T.20.0]         0.82   9.95              2.85\n",
      "C(centre)[T.21.0]         0.00    inf  2,857,809,871.09\n",
      "age                       1.01   1.09              1.05\n",
      "ever_smoked               0.31   2.13              0.81\n",
      "peto_exposed              0.17   1.37              0.48\n",
      "ever_smoked:peto_exposed  1.39  16.74              4.82\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n",
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: overflow encountered in exp\n",
      "  # Remove the CWD from sys.path while we load stuff.\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ age + C(centre) + ever_smoked*peto_exposed', data=df[df['genotype'] > 0])              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.508455\n",
      "         Iterations 7\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  338\n",
      "Model:                          Logit   Df Residuals:                      332\n",
      "Method:                           MLE   Df Model:                            5\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                  0.1440\n",
      "Time:                        18:50:07   Log-Likelihood:                -171.86\n",
      "converged:                       True   LL-Null:                       -200.76\n",
      "Covariance Type:            nonrobust   LLR p-value:                 3.460e-11\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -3.4483      1.326     -2.601      0.009      -6.047      -0.850\n",
      "age                          0.0450      0.016      2.780      0.005       0.013       0.077\n",
      "distfromcentre               0.0422      0.009      4.702      0.000       0.025       0.060\n",
      "ever_smoked                 -0.1933      0.472     -0.409      0.682      -1.118       0.732\n",
      "peto_exposed                -0.3531      0.505     -0.700      0.484      -1.342       0.636\n",
      "ever_smoked:peto_exposed     1.1723      0.598      1.961      0.050       0.001       2.344\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.00   0.43 0.03\n",
      "age                       1.01   1.08 1.05\n",
      "distfromcentre            1.02   1.06 1.04\n",
      "ever_smoked               0.33   2.08 0.82\n",
      "peto_exposed              0.26   1.89 0.70\n",
      "ever_smoked:peto_exposed  1.00  10.42 3.23\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ age + distfromcentre + ever_smoked*peto_exposed', data=df[df['genotype'] > 0])              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ipfjes_id', 'case', 'dob', 'age', 'yob', 'agegroup', 'ethnicity',\n",
       "       'ever_smoked', 'current_smoker', 'packyrs', 'participant_id', 'centre',\n",
       "       'gp_coords', 'centre_coords', 'distfromcentre', 'ct', 'bx', 'fhx',\n",
       "       'amiodarone', 'flecainade', 'nitrofurantoin', 'azathioprine',\n",
       "       'gefitinib', 'ifosfamide', 'melphalan', 'rituximab', 'mrc0', 'mrc1',\n",
       "       'mrc2', 'mrc3', 'mrc4', 'pc_sob', 'pc_cough', 'pc_incidental',\n",
       "       'pc_incidental_desc', 'pc_other', 'comments', 'mrc_score',\n",
       "       'peto_exposed', 'exposed_stone', 'exposed_wood', 'exposed_metal',\n",
       "       'exposed_farm', 'exposed_asbestos', 'peto_dose', 'median_ssec',\n",
       "       'fibre_ml_exposure', 'lowest_peto_cat', 'peto_shortlist',\n",
       "       'coggan_shortlist', 'mean_pmr', 'highest_pmr', 'meso_pmr_dose',\n",
       "       'agecat', 'ethcat', 'ctcat', 'bxcat', 'centre_name',\n",
       "       'Aberdeen Royal Infirmary',\n",
       "       'Aintree University Hospitals NHS Foundation Trust',\n",
       "       'Glasgow Royal Infirmary', 'Guys’ and St Thomas’ NHS Foundation Trust',\n",
       "       'Heart of England NHS Foundation Trust',\n",
       "       'Imperial College Healthcare NHS Trust',\n",
       "       'Leeds Teaching Hospitals NHS Trust', 'Morriston Hospital',\n",
       "       'Norfolk and Norwich University Hospitals NHS Foundation Trust',\n",
       "       'North Bristol NHS Trust', 'Nottingham University Hospitals NHS Trust',\n",
       "       'Papworth Hospital NHS Foundation Trust',\n",
       "       'Portsmouth Hospitals NHS Trust',\n",
       "       'Royal Devon and Exeter NHS Foundation Trust',\n",
       "       'Royal Infirmary of Edinburgh',\n",
       "       'Southampton University Hospitals NHS Trust',\n",
       "       'Taunton and Somerset NHS Foundation Trust',\n",
       "       'The Newcastle Upon Tyne Hospitals NHS Foundation Trust',\n",
       "       'University Hospital of South Manchester',\n",
       "       'University Hospitals Birmingham NHS Foundation Trust',\n",
       "       'Worcestershire Acute Hospitals NHS Trust', 'Arab',\n",
       "       'Asian / Asian British', 'Black / African/ Caribbean/ Black British',\n",
       "       'Mixed / Multiple ethnic groups', 'Other ethnic group', 'White',\n",
       "       'definite UIP', 'no CT', 'other', 'possible UIP', 'ever_drug_exposed',\n",
       "       'lowest_peto_cat_reordered', 'median_ssec_int', 'genotype',\n",
       "       'minor_allele'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.555380\n",
      "         Iterations 6\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  338\n",
      "Model:                          Logit   Df Residuals:                      327\n",
      "Method:                           MLE   Df Model:                           10\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.06495\n",
      "Time:                        18:50:08   Log-Likelihood:                -187.72\n",
      "converged:                       True   LL-Null:                       -200.76\n",
      "Covariance Type:            nonrobust   LLR p-value:                  0.003637\n",
      "=================================================================================================================\n",
      "                                                    coef    std err          z      P>|z|      [0.025      0.975]\n",
      "-----------------------------------------------------------------------------------------------------------------\n",
      "Intercept                                        -2.5496      1.348     -1.891      0.059      -5.193       0.093\n",
      "C(lowest_peto_cat_reordered)[T.2]                 0.2687      0.750      0.358      0.720      -1.202       1.739\n",
      "C(lowest_peto_cat_reordered)[T.3]                -0.5650      0.674     -0.839      0.402      -1.885       0.755\n",
      "C(lowest_peto_cat_reordered)[T.4]                 0.0064      0.767      0.008      0.993      -1.497       1.510\n",
      "C(lowest_peto_cat_reordered)[T.5]                -1.3882      0.946     -1.467      0.142      -3.243       0.467\n",
      "age                                               0.0445      0.016      2.810      0.005       0.013       0.076\n",
      "ever_smoked                                      -0.1762      0.656     -0.269      0.788      -1.461       1.109\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.2]    -0.4357      0.892     -0.489      0.625      -2.183       1.312\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.3]     1.3211      0.832      1.588      0.112      -0.310       2.952\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.4]     1.1288      0.915      1.234      0.217      -0.665       2.922\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.5]     1.6952      1.082      1.567      0.117      -0.425       3.815\n",
      "=================================================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                                               2.5%  97.5%   OR\n",
      "Intercept                                      0.01   1.10 0.08\n",
      "C(lowest_peto_cat_reordered)[T.2]              0.30   5.69 1.31\n",
      "C(lowest_peto_cat_reordered)[T.3]              0.15   2.13 0.57\n",
      "C(lowest_peto_cat_reordered)[T.4]              0.22   4.53 1.01\n",
      "C(lowest_peto_cat_reordered)[T.5]              0.04   1.59 0.25\n",
      "age                                            1.01   1.08 1.05\n",
      "ever_smoked                                    0.23   3.03 0.84\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.2]  0.11   3.71 0.65\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.3]  0.73  19.14 3.75\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.4]  0.51  18.58 3.09\n",
      "ever_smoked:C(lowest_peto_cat_reordered)[T.5]  0.65  45.39 5.45\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ age + ever_smoked*C(lowest_peto_cat_reordered)', data=df[df['genotype'] > 0])              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['ipfjes_id', 'case', 'dob', 'age', 'yob', 'agegroup', 'ethnicity',\n",
       "       'ever_smoked', 'current_smoker', 'packyrs', 'participant_id', 'centre',\n",
       "       'gp_coords', 'centre_coords', 'distfromcentre', 'ct', 'bx', 'fhx',\n",
       "       'amiodarone', 'flecainade', 'nitrofurantoin', 'azathioprine',\n",
       "       'gefitinib', 'ifosfamide', 'melphalan', 'rituximab', 'mrc0', 'mrc1',\n",
       "       'mrc2', 'mrc3', 'mrc4', 'pc_sob', 'pc_cough', 'pc_incidental',\n",
       "       'pc_incidental_desc', 'pc_other', 'comments', 'mrc_score',\n",
       "       'peto_exposed', 'exposed_stone', 'exposed_wood', 'exposed_metal',\n",
       "       'exposed_farm', 'exposed_asbestos', 'peto_dose', 'median_ssec',\n",
       "       'fibre_ml_exposure', 'lowest_peto_cat', 'peto_shortlist',\n",
       "       'coggan_shortlist', 'mean_pmr', 'highest_pmr', 'meso_pmr_dose',\n",
       "       'agecat', 'ethcat', 'ctcat', 'bxcat', 'centre_name',\n",
       "       'Aberdeen Royal Infirmary',\n",
       "       'Aintree University Hospitals NHS Foundation Trust',\n",
       "       'Glasgow Royal Infirmary', 'Guys’ and St Thomas’ NHS Foundation Trust',\n",
       "       'Heart of England NHS Foundation Trust',\n",
       "       'Imperial College Healthcare NHS Trust',\n",
       "       'Leeds Teaching Hospitals NHS Trust', 'Morriston Hospital',\n",
       "       'Norfolk and Norwich University Hospitals NHS Foundation Trust',\n",
       "       'North Bristol NHS Trust', 'Nottingham University Hospitals NHS Trust',\n",
       "       'Papworth Hospital NHS Foundation Trust',\n",
       "       'Portsmouth Hospitals NHS Trust',\n",
       "       'Royal Devon and Exeter NHS Foundation Trust',\n",
       "       'Royal Infirmary of Edinburgh',\n",
       "       'Southampton University Hospitals NHS Trust',\n",
       "       'Taunton and Somerset NHS Foundation Trust',\n",
       "       'The Newcastle Upon Tyne Hospitals NHS Foundation Trust',\n",
       "       'University Hospital of South Manchester',\n",
       "       'University Hospitals Birmingham NHS Foundation Trust',\n",
       "       'Worcestershire Acute Hospitals NHS Trust', 'Arab',\n",
       "       'Asian / Asian British', 'Black / African/ Caribbean/ Black British',\n",
       "       'Mixed / Multiple ethnic groups', 'Other ethnic group', 'White',\n",
       "       'definite UIP', 'no CT', 'other', 'possible UIP', 'ever_drug_exposed',\n",
       "       'lowest_peto_cat_reordered', 'median_ssec_int', 'genotype',\n",
       "       'minor_allele'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.572279\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                  302\n",
      "Model:                          Logit   Df Residuals:                      297\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.06056\n",
      "Time:                        18:50:08   Log-Likelihood:                -172.83\n",
      "converged:                       True   LL-Null:                       -183.97\n",
      "Covariance Type:            nonrobust   LLR p-value:                 0.0001760\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                   -2.5108      1.308     -1.919      0.055      -5.075       0.053\n",
      "age                          0.0465      0.016      2.835      0.005       0.014       0.079\n",
      "ever_smoked                 -0.5233      0.462     -1.132      0.258      -1.430       0.383\n",
      "peto_exposed                -0.8041      0.499     -1.613      0.107      -1.781       0.173\n",
      "ever_smoked:peto_exposed     1.6815      0.590      2.850      0.004       0.525       2.838\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%  97.5%   OR\n",
      "Intercept                 0.01   1.05 0.08\n",
      "age                       1.01   1.08 1.05\n",
      "ever_smoked               0.24   1.47 0.59\n",
      "peto_exposed              0.17   1.19 0.45\n",
      "ever_smoked:peto_exposed  1.69  17.08 5.37\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df[df['genotype'] == 1])              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Maximum number of iterations has been exceeded.\n",
      "         Current function value: 0.371688\n",
      "         Iterations: 35\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:                   case   No. Observations:                   36\n",
      "Model:                          Logit   Df Residuals:                       31\n",
      "Method:                           MLE   Df Model:                            4\n",
      "Date:                Fri, 04 Sep 2020   Pseudo R-squ.:                 0.07756\n",
      "Time:                        18:50:08   Log-Likelihood:                -13.381\n",
      "converged:                      False   LL-Null:                       -14.506\n",
      "Covariance Type:            nonrobust   LLR p-value:                    0.6898\n",
      "============================================================================================\n",
      "                               coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------------------\n",
      "Intercept                    4.2880      5.075      0.845      0.398      -5.658      14.234\n",
      "age                         -0.0439      0.060     -0.735      0.462      -0.161       0.073\n",
      "ever_smoked                  0.7230      1.684      0.429      0.668      -2.578       4.024\n",
      "peto_exposed                22.6988    7.6e+04      0.000      1.000   -1.49e+05    1.49e+05\n",
      "ever_smoked:peto_exposed   -22.4422    7.6e+04     -0.000      1.000   -1.49e+05    1.49e+05\n",
      "============================================================================================\n",
      "Odds Ratios\n",
      "======================\n",
      "                          2.5%        97.5%               OR\n",
      "Intercept                 0.00 1,519,826.23            72.82\n",
      "age                       0.85         1.08             0.96\n",
      "ever_smoked               0.08        55.92             2.06\n",
      "peto_exposed              0.00          inf 7,210,463,004.56\n",
      "ever_smoked:peto_exposed  0.00          inf             0.00\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  \"Check mle_retvals\", ConvergenceWarning)\n",
      "/home/drcjar/anaconda3/envs/ipfjes/lib/python3.5/site-packages/ipykernel_launcher.py:10: RuntimeWarning: overflow encountered in exp\n",
      "  # Remove the CWD from sys.path while we load stuff.\n"
     ]
    }
   ],
   "source": [
    "model = smf.logit('case ~ age + ever_smoked*peto_exposed', data=df[df['genotype'] == 2])              \n",
    "result = model.fit()\n",
    "print(result.summary())\n",
    "print (\"Odds Ratios\")\n",
    "print (\"======================\")\n",
    "params = result.params\n",
    "conf = result.conf_int()\n",
    "conf['OR'] = params\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "print(np.exp(conf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>case</th>\n",
       "      <th>age</th>\n",
       "      <th>ever_smoked</th>\n",
       "      <th>peto_exposed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>case</th>\n",
       "      <td>1.00</td>\n",
       "      <td>-0.15</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>-0.15</td>\n",
       "      <td>1.00</td>\n",
       "      <td>-0.18</td>\n",
       "      <td>-0.19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ever_smoked</th>\n",
       "      <td>0.04</td>\n",
       "      <td>-0.18</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>peto_exposed</th>\n",
       "      <td>0.14</td>\n",
       "      <td>-0.19</td>\n",
       "      <td>0.26</td>\n",
       "      <td>1.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              case   age  ever_smoked  peto_exposed\n",
       "case          1.00 -0.15         0.04          0.14\n",
       "age          -0.15  1.00        -0.18         -0.19\n",
       "ever_smoked   0.04 -0.18         1.00          0.26\n",
       "peto_exposed  0.14 -0.19         0.26          1.00"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['genotype'] == 2][['case', 'age', 'ever_smoked', 'peto_exposed']].corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
No results found