apop_tests.c 25.4 KB
Newer Older
1 2

/** \file apop_tests.c	 */
3
/* Copyright (c) 2007 by Ben Klemens.  Licensed under the GPLv2; see COPYING.  
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
 */
#include "apop_internal.h"

static apop_data * produce_t_test_output(int df, double stat, double diff){
    double pval, qval, two_tail;
    if (!gsl_isnan(stat)){
        pval    = gsl_cdf_tdist_P(stat, df);
        qval    = 1-pval;
        two_tail= 2*GSL_MIN(fabs(pval-.5),fabs(qval-0.5));
    } else {
        pval    = GSL_NAN;
        qval    = GSL_NAN;
        two_tail= GSL_NAN;
    }
    apop_data *out = apop_data_alloc();
    apop_data_add_named_elmt(out, "mean left - right", diff);
    apop_data_add_named_elmt(out, "t statistic", stat);
    apop_data_add_named_elmt(out, "df", df);
    apop_data_add_named_elmt(out, "p value, 1 tail", GSL_MIN(pval,qval));
    apop_data_add_named_elmt(out, "confidence, 1 tail", 1 - GSL_MIN(pval,qval));
    apop_data_add_named_elmt(out, "p value, 2 tail", 1- two_tail);
    apop_data_add_named_elmt(out, "confidence, 2 tail", two_tail);
    return out;
}

/** Answers the question: with what confidence can I say that the means of these two columns of data are different?

If \c apop_opts.verbose is >=1, then display some information to stdout, like the mean/var/count for both vectors and the t statistic.

33 34
\param a one column of data
\param b another column of data
35
\return an \ref apop_data set with the following elements:
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
    <tt>mean left - right</tt>:    the difference in means; if positive, first vector has larger mean, and one-tailed test is testing \f$L > R\f$, else reverse if negative.<br>
    <tt>t statistic</tt>:    used for the test<br>
    <tt>df</tt>:             degrees of freedom<br>
    <tt>p value, 1 tail</tt>: the p-value for a one-tailed test that one vector mean is greater than the other.<br>
    <tt>confidence, 1 tail</tt>: 1- p value.<br>
    <tt>p value, 2 tail</tt>: the p-value for the two-tailed test that left mean = right mean.<br>
    <tt>confidence, 2 tail</tt>: 1-p value

Example usage:
\code
gsl_vector *L = apop_query_to_vector("select * from data where sex='M'");
gsl_vector *R = apop_query_to_vector("select * from data where sex='F'");
apop_data *test_out = apop_t_test(L, R);
printf("Reject the null hypothesis of no difference between M and F with %g%% confidence\n", apop_data_get(test_out, .rowname="confidence, 2 tail"));
\endcode

\see \ref apop_paired_t_test, which answers the question: with what confidence can I
say that the mean difference between the two columns is zero?
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
*/
apop_data *	apop_t_test(gsl_vector *a, gsl_vector *b){
    int a_count = a->size,
        b_count = b->size;
    double a_avg = apop_vector_mean(a);
    double a_var = (a_count > 1) ? apop_vector_var(a) : 0,
           b_avg = apop_vector_mean(b),
    b_var = b_count > 1 ? apop_vector_var(b): 0,
    stat = (a_avg - b_avg)/ sqrt(
                        (b_count > 1 ? b_var/(b_count-1) : 0) 
                        + (a_count > 1 ? a_var/(a_count-1) : 0) 
                        );
    if (apop_opts.verbose >=1){
        printf("1st avg: %g; 1st std dev: %g; 1st count: %i.\n", a_avg, sqrt(a_var), a_count);
        printf("2st avg: %g; 2st std dev: %g; 2nd count: %i.\n", b_avg, sqrt(b_var), b_count);
        printf("t-statistic: %g.\n", stat);
    }
    int df = a_count+b_count-2;
    return produce_t_test_output(df, stat, a_avg - b_avg);
}

/** Answers the question: with what confidence can I say that the mean difference between the two columns is zero?

77
If <tt>apop_opts.verbose >=2</tt>, then display some information, like the mean/var/count for both vectors and the t statistic, to stderr.
78

79 80
\param a A column of data
\param b A matched column of data
81
\return an \ref apop_data set with the following elements:
82 83 84 85 86 87 88 89 90
    <tt>mean left - right</tt>:    the difference in means; if positive, first vector has larger mean, and one-tailed test is testing \f$L > R\f$, else reverse if negative.<br>
    <tt>t statistic</tt>:    used for the test<br>
    <tt>df</tt>:             degrees of freedom<br>
    <tt>p value, 1 tail</tt>: the p-value for a one-tailed test that one vector mean is greater than the other.<br>
    <tt>confidence, 1 tail</tt>: 1- p value.<br>
    <tt>p value, 2 tail</tt>: the p-value for the two-tailed test that left mean = right mean.<br>
    <tt>confidence, 2 tail</tt>: 1-p value

\see \ref apop_t_test for an example, and for when the element-by-element difference between the vectors has no sensible interpretation.
91 92 93 94 95 96 97 98 99 100 101 102 103 104
*/
apop_data * apop_paired_t_test(gsl_vector *a, gsl_vector *b){
    gsl_vector *diff = gsl_vector_alloc(a->size);
    gsl_vector_memcpy(diff, a);
    gsl_vector_sub(diff, b);
    int count = a->size; 
    double avg = apop_vector_mean(diff),
    var = apop_vector_var(diff),
    stat = avg/ sqrt(var/(count-1));
    gsl_vector_free(diff);
    Apop_notify(2, "avg diff: %g; diff std dev: %g; count: %i; t-statistic: %g.\n", avg, sqrt(var), count, stat);
    return produce_t_test_output(count-1, stat, avg);
}

105 106
/** Runs an F-test specified by \c q and \c c. See
 the chapter on hypothesis testing in  <a href="http://modelingwithdata.org">Modeling With Data</a>, p 309, which will tell you that:
107 108 109 110 111
 \f[{N-K\over q}
 {({\bf Q}'\hat\beta - {\bf c})' [{\bf Q}' ({\bf X}'{\bf X})^{-1} {\bf Q}]^{-1} ({\bf Q}' \hat\beta - {\bf c})
 \over {\bf u}' {\bf u} } \sim F_{q,N-K},\f]
 and that's what this function is based on.

112 113 114 115 116 117 118
\param est An \ref apop_model that you have already calculated. (No default)
\param contrast  An \ref apop_data set whose matrix represents \f${\bf Q}\f$ and whose
    vector represents \f${\bf c}\f$. Each row represents a hypothesis. (Defaults:
    if matrix is \c NULL, it is set to the identity matrix with the top row missing. If
    the vector is \c NULL, it is set to a zero matrix of length equal to the height of
    the contrast matrix. Thus, if the entire \c apop_data set is NULL or omitted, we are
    testing the hypothesis that all but \f$\beta_1\f$ are zero.)
119

120 121
\return An \c apop_data set with a few variants on the confidence with which we can reject the joint hypothesis.
\todo There should be a way to get OLS and GLS to store \f$(X'X)^{-1}\f$. In fact, if you did GLS, this is invalid, because you need \f$(X'\Sigma X)^{-1}\f$, and I didn't ask for \f$\Sigma\f$.
122 123 124 125 126

\exception out->error='a'  Allocation error.
\exception out->error='d'  dimension-matching error.
\exception out->error='i'  matrix inversion error.
\exception out->error='m'  GSL math error.
127 128 129 130 131 132 133 134

\li There are two approaches to an \f$F\f$-test: the ANOVA approach, which is typically
    built around the claim that all effects but the mean are zero; and the more general
    regression form, which allows for any set of linear claims about the data. If you send
    a \c NULL contrast set, I will generate the set of linear contrasts that are equivalent
    to the ANOVA-type approach. This is why the top row of the default \f${\bf Q}\f$
    matrix is missing: there is no hypothesis test about the coefficient for the
    constant term. See the example below.
135
\li This function uses the \ref designated syntax for inputs.
136 137

\include f_test.c
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
*/
#ifdef APOP_NO_VARIADIC
apop_data * apop_f_test(apop_model *est, apop_data *contrast){
#else
apop_varad_head(apop_data *, apop_f_test){
    apop_model *apop_varad_var(est, NULL)
    Nullcheck_m(est, NULL);
    Nullcheck_d(est->data, NULL);
    apop_data * apop_varad_var(contrast, NULL);
    int free_data=0,free_matrix=0,free_vector=0;
    if (!contrast) contrast = apop_data_alloc(),free_data=1;
    if (!contrast->matrix) {
        int size = est->parameters->vector->size;
        contrast->matrix= gsl_matrix_calloc(size - 1, size);
        for (int i=1; i< size; i++)
            apop_data_set(contrast, i-1, i, 1);
    }
    if (!contrast->vector) contrast->vector = gsl_vector_calloc(contrast->matrix->size1),free_vector=1;

    apop_data *out = apop_f_test_base(est, contrast);
    if (free_data) {apop_data_free(contrast); return out;}
    if (free_matrix) gsl_matrix_free(contrast->matrix);
    if (free_vector) gsl_vector_free(contrast->vector);
    return out;
    return apop_f_test_base(est, contrast);
}

 apop_data * apop_f_test_base(apop_model *est, apop_data *contrast){
#endif
    apop_data *out = apop_data_alloc();
    Asprintf(&out->names->title, "F test");
    size_t contrast_ct = contrast->vector->size;
    Apop_stopif(contrast->matrix->size1 != contrast_ct,  out->error='d'; return out,
            0, "I counted %zu contrasts by the size of either contrast->vector or "
            "est->parameters->vector->size, but you gave me a matrix with %zu rows. Those should match."
            , contrast_ct, contrast->matrix->size1);
    double f_stat, pval;

    Get_vmsizes(est->data); //msize1, msize2
    int data_df = msize1 - contrast_ct;

    //Find (\underbar x)'(\underbar x), where (\underbar x) = the data with means removed
    long double means[msize2];
    for (int i=1; i< msize2; i++)
        means[i] = apop_vector_mean(Apop_cv(est->data, i));
    means[0]=0;// don't screw with the ones column.
    apop_data *xpx = apop_data_alloc(msize2, msize2);
    Apop_stopif(xpx->error, apop_data_free(xpx); out->error='a'; return out, 0, "allocation error");
    for (int i=0; i< msize2; i++)
        for (int j=0; j< msize2; j++){ //at this loop, we calculate one cell in the dot prouct
            long double total = 0;
            for (int c=0; c<msize1; c++)
                total +=  (gsl_matrix_get(est->data->matrix, c, i) - means[i])
                         *(gsl_matrix_get(est->data->matrix, c, j) - means[j]);
            apop_data_set(xpx, i, j, total);
        }

    apop_data xpxinv = (apop_data){.matrix=apop_matrix_inverse(xpx->matrix)};
    Apop_stopif(!xpxinv.matrix, out->error='i'; return out, 0, "inversion of X'X error");
    apop_data *qprimexpxinv = apop_dot(contrast, &xpxinv, 'm', 'm');
    apop_data *qprimexpxinvq = apop_dot(qprimexpxinv, contrast, 'm', 't');
    Apop_stopif(qprimexpxinvq->error || qprimexpxinv->error, out->error='m'; return out, 0, "broken dot");
    apop_data qprimexpxinvqinv = (apop_data){.matrix=apop_matrix_inverse(qprimexpxinvq->matrix)};
    Apop_stopif(!qprimexpxinvqinv.matrix, out->error='i'; return out, 0, "inversion of Q'(X'X)^{-1}Q error");
    apop_data_free(qprimexpxinvq);
    apop_data_free(qprimexpxinv);
    apop_data *qprimebeta = apop_dot(contrast, est->parameters, 'm', 'v');
    Apop_stopif(qprimebeta->error, out->error='m'; return out, 0, "broken dot");
    gsl_vector_sub(qprimebeta->vector, contrast->vector);
    apop_data *qprimebetaminusc_qprimexpxinvqinv = apop_dot(&qprimexpxinvqinv, qprimebeta, .form2='v');
    Apop_stopif(qprimebetaminusc_qprimexpxinvqinv->error, out->error='m'; return out, 0, "broken dot");
    gsl_blas_ddot(qprimebeta->vector, qprimebetaminusc_qprimexpxinvqinv->vector, &f_stat);
    apop_data_free(xpx);
    apop_data_free(qprimebeta);
    apop_data_free(qprimebetaminusc_qprimexpxinvqinv);

    apop_data *r_sq_list = apop_estimate_coefficient_of_determination (est);
    double variance = apop_data_get(r_sq_list, .rowname="sse");
    f_stat *=  data_df / (variance * contrast_ct);
    pval    = (contrast_ct > 0 && data_df > 0) ? gsl_cdf_fdist_Q(f_stat, contrast_ct, data_df): GSL_NAN; 

    apop_data_add_named_elmt(out, "F statistic", f_stat);
    apop_data_add_named_elmt(out, "p value", pval);
    apop_data_add_named_elmt(out, "confidence", 1- pval);
    apop_data_add_named_elmt(out, "df1", contrast_ct);
    apop_data_add_named_elmt(out, "df2", data_df);
    return out;
}

static double one_chi_sq(apop_data *d, int row, int col, int n){
    double rowexp  = apop_vector_sum(Apop_rv(d, row))/n;
    double colexp  = apop_vector_sum(Apop_cv(d, col))/n;
    double observed = apop_data_get(d, row, col);
    double expected = n * rowexp * colexp;
    return gsl_pow_2(observed - expected)/expected; 
}

/** Run a Chi-squared test on an ANOVA table, i.e., an NxN table with the null hypothesis that all cells are equally likely.

 \param d The input data, which is a crosstab of various elements. They don't have to sum to one.
238 239 240
\return A \ref apop_data set including elements named
     <tt>"chi squared statistic"</tt>, <tt>"df"</tt>, and <tt>"p value"</tt>. Retrieve via,
     e.g., <tt>apop_data_get(out, .rowname="p value")</tt>.
241
 \see apop_test_fisher_exact
242
*/
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307
apop_data * apop_test_anova_independence(apop_data *d){
    Apop_stopif(!d || !d->matrix, return NULL, 0, "You sent me data with no matrix element. Returning NULL.");
    double total = 0;
    //You can have a one-column or one-row matrix if you want; else df = (rows-1)*(cols-1)
    double df = d->matrix->size1==1 ? d->matrix->size2-1 : d->matrix->size2 == 1 ? d->matrix->size1 
                              : (d->matrix->size1 - 1)* (d->matrix->size2 - 1);
    Apop_stopif(!df, return NULL, 0, "You sent a degenerate matrix. Returning NULL.");
    int n = apop_matrix_sum(d->matrix);
    for (size_t row=0; row <d->matrix->size1; row++)
        for (size_t col=0; col <d->matrix->size2; col++)
            total += one_chi_sq(d, row, col, n);
    apop_data *out = apop_data_alloc();
    double chisq = gsl_cdf_chisq_Q(total, df);
    apop_data_add_named_elmt(out, "chi squared statistic", total);
    apop_data_add_named_elmt(out, "df", df);
    apop_data_add_named_elmt(out, "p value", chisq);
    return out;
}


static apop_data* apop_anova_one_way(char *table, char *data, char *grouping){
    //ANOVA has always just been a process of filling in a form, and
    //that's what this function does.
    apop_data *out = apop_data_calloc(3, 6);
    apop_name_add(out->names, "sum of squares", 'c');
    apop_name_add(out->names, "df", 'c');
    apop_name_add(out->names, "mean squares", 'c');
    apop_name_add(out->names, "F ratio", 'c');
    apop_name_add(out->names, "p value", 'c');
    apop_name_add(out->names, "confidence", 'c');
    apop_name_add(out->names, grouping, 'r');
    apop_name_add(out->names, "residual", 'r');
    apop_name_add(out->names, "total", 'r');
 
    //total sum of squares:
    apop_data* tss = apop_query_to_data("select var_pop(%s), count(*) from %s", data, table);
    Apop_stopif(!tss, apop_return_data_error('q'), 0, "Query 'select var_pop(%s), count(*) from %s' returned NULL. Does that look right to you?", data, table);
    apop_data_set(out, 2, 0, apop_data_get(tss, 0, 0)*apop_data_get(tss, 0, 1)); //total sum of squares
    double total_df = apop_data_get(tss, 0, 1);
    apop_data_set(out, 2, 1, apop_data_get(tss, 0, 1)); //total df.

    //within group sum of squares:
    apop_data* wss = apop_query_to_data("select var_pop(%s), count(*) from %s group by %s", data,  table, grouping);
    double group_df = wss->matrix->size1-1;
    apop_data_set(out, 0, 0, apop_data_get(wss, 0, 0)*group_df); //within sum of squares
    apop_data_set(out, 0, 1, group_df);

    //residuals are just total-wss
    apop_data_set(out, 1, 0, apop_data_get(out, 2, 0) - apop_data_get(out, 0,0)); //residual sum of squares
    double residual_df = total_df - group_df;
    apop_data_set(out, 1, 1, residual_df); //residual df

    apop_data_set(out, 0, 2, apop_data_get(out, 0, 0)/apop_data_get(out, 0, 1));//mean SS within
    apop_data_set(out, 1, 2, apop_data_get(out, 1, 0)/apop_data_get(out, 1, 1));//mean SS residual

    apop_data_set(out, 0, 3, apop_data_get(out, 0, 2)/apop_data_get(out, 1, 2));//F ratio
    apop_data_set(out, 0, 4, gsl_cdf_fdist_P(apop_data_get(out, 0, 3), group_df, residual_df));//pval
    apop_data_set(out, 0, 5, 1- apop_data_get(out, 0, 4));//confidence

    apop_data_free(tss);
    apop_data_free(wss);
    return out;
}

/** This function produces a traditional one- or two-way ANOVA table. It
308
  works from data in an SQL table, using queries of a form like <tt>select
309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
  data from table group by grouping1, grouping2</tt>.

  \param table The table to be queried. Anything that can go in an SQL <tt>from</tt> clause is OK, so this can be a plain table name or a temp table specification like <tt>(select ... )</tt>, with parens.
  \param data The name of the column holding the count or other such data
  \param grouping1 The name of the first column by which to group data
  \param grouping2 If this is \c NULL, then the function will return a one-way ANOVA. Otherwise, the name of the second column by which to group data in a two-way ANOVA.
 */
#ifdef APOP_NO_VARIADIC
apop_data* apop_anova(char *table, char *data, char *grouping1, char *grouping2){
#else
apop_varad_head(apop_data*, apop_anova){
    char *apop_varad_var(table, NULL)
    Apop_stopif(!table, return NULL, 0, "I need the name of a table in the SQL database.");
    if (!strchr(table, ')')) //if you found ()s,then it is a temp table spec.
        Apop_stopif(!apop_table_exists(table), return NULL, 0, "I couldn't find the table %s in the database.", table);
    char *apop_varad_var(data, NULL)
    Apop_stopif(!data, return NULL, 0, "I need the name of the column in the %s table with the count or other data.", table);
    char *apop_varad_var(grouping1, NULL)
    Apop_stopif(!data, return NULL, 0, "I need at least grouping1, a column in the %s table.", table);
    char *apop_varad_var(grouping2, NULL)
    return apop_anova_base(table, data, grouping1, grouping2);
}

 apop_data* apop_anova_base(char *table, char *data, char *grouping1, char *grouping2){
#endif
    apop_data *first = apop_anova_one_way(table, data, grouping1);
    Apop_stopif(first->error, return first, 0, "Error (%c) running one-way ANOVA.", first->error);
    if (!grouping2) return first;
    apop_data *second = apop_anova_one_way(table, data, grouping2);
    char *joined = NULL;
    Asprintf(&joined, "%s, %s", grouping1, grouping2);
    apop_data *interaction = apop_anova_one_way(table, data, joined);
    apop_data *out = apop_data_calloc(5, 6);
    apop_name_stack(out->names, first->names, 'c');
    apop_data_add_names(out, 'r', first->names->row[0], second->names->row[0],
                                  "interaction", "residual", "total");
    gsl_vector *firstrow = Apop_rv(first, 0);
    gsl_vector *secondrow = Apop_rv(second, 0);
    gsl_vector *interrow = Apop_rv(interaction, 0);
    gsl_matrix_set_row(out->matrix, 0, firstrow);
    gsl_matrix_set_row(out->matrix, 1, secondrow);
    gsl_matrix_set_row(out->matrix, 2, interrow);
    gsl_matrix_set_row(out->matrix, 4, Apop_rv(first, 2));
    
    //residuals are just total-wss
    apop_data_set(out, 3, 0, apop_data_get(out, 4, 0) 
            - gsl_vector_get(firstrow, 0) - gsl_vector_get(secondrow, 0) - gsl_vector_get(interrow, 0)); //residual sum of squares
    double residual_df = apop_data_get(out, 4, 1) 
            - gsl_vector_get(firstrow, 1) - gsl_vector_get(secondrow, 1) - gsl_vector_get(interrow, 1); //residual df
    apop_data_set(out, 3, 1, residual_df);

    apop_data_set(out, 3, 2, apop_data_get(out, 3, 0)/apop_data_get(out, 3, 1));//mean SS residual

    apop_data_set(out, 0, 3, apop_data_get(out, 0, 2)/apop_data_get(out, 3, 2));//F ratio
    apop_data_set(out, 0, 4, gsl_cdf_fdist_P(apop_data_get(out, 0, 3), gsl_vector_get(firstrow, 1), residual_df));//pval
    apop_data_set(out, 0, 5, 1- apop_data_get(out, 0, 4));//confidence

    apop_data_set(out, 1, 3, apop_data_get(out, 1, 2)/apop_data_get(out, 3, 2));//F ratio
    apop_data_set(out, 1, 4, gsl_cdf_fdist_P(apop_data_get(out, 1, 3), gsl_vector_get(secondrow, 1), residual_df));//pval
    apop_data_set(out, 1, 5, 1- apop_data_get(out, 1, 4));//confidence

    apop_data_set(out, 2, 3, apop_data_get(out, 2, 2)/apop_data_get(out, 3, 2));//F ratio
    apop_data_set(out, 2, 4, gsl_cdf_fdist_P(apop_data_get(out, 2, 3), gsl_vector_get(interrow, 1), residual_df));//pval
    apop_data_set(out, 2, 5, 1- apop_data_get(out, 2, 4));//confidence

    free(joined);
    apop_data_free(first);
    apop_data_free(second);
    apop_data_free(interaction);
    return out;
}

/** This is a convenience function to do the lookup of a given statistic along a given
distribution. You give me a statistic, its (hypothesized) distribution, and whether
to use the upper tail, lower tail, or both. I will return the odds of a Type I error
given the model---in statistician jargon, the \f$p\f$-value.  [Type I error: odds of
rejecting the null hypothesis when it is true.]

   For example, 
   \code
   apop_test(1.3);
   \endcode

will return the density of the standard Normal distribution that is more than 1.3 from zero.  
If this function returns a small value, we can be confident that the statistic is significant. Or, 
   \code
395
   apop_test(1.3, "t", 10, .tail='u');
396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
   \endcode

will give the appropriate odds for an upper-tailed test using the \f$t\f$-distribution with 10 degrees of freedom (e.g., a \f$t\f$-test of the null hypothesis that the statistic is less than or equal to zero).

Several more distributions are supported; see below.

 \li For a two-tailed test (the default), this returns the density outside the range. I'll only do this for symmetric distributions.
 \li For an upper-tail test ('u'), this returns the density above the cutoff
 \li For a lower-tail test ('l'), this returns the density below the cutoff 
 
\param statistic    The scalar value to be tested.  
\param distribution  The name of the distribution; see below.
\param p1  The first parameter for the distribution; see below.
\param p2  The second parameter for the distribution; see below.
\param tail 'u' = upper tail; 'l' = lower tail; anything else = two-tailed. (default = two-tailed)

\return The odds of a Type I error given the model (the \f$p\f$-value).

414
Here are the distributions you can use and their parameters.
415 416

\c "normal" or \c "gaussian" 
417
\li p1=\f$\mu\f$, p2=\f$\sigma\f$
418 419 420
\li default (0, 1)

\c "lognormal"  
421
\li p1=\f$\mu\f$, p2=\f$\sigma\f$
422
\li default (0, 1) 
423
\li Remember, \f$\mu\f$ and \f$\sigma\f$ refer to the Normal one would get after exponentiation
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454
\li One-tailed tests only

\c "uniform"  
\li p1=lower edge, p2=upper edge
\li default (0, 1)
\li two-tailed tests are run relative to the center, (p1+p2)/2.

\c "t"  
\li p1=df
\li no default

\c "chi squared", \c "chi", \c "chisq": 
\li p1=df
\li no default
\li One-tailed tests only; default='u' (\f$p\f$-value for typical cases)

\c "f"  
\li p1=df1, p2=df2
\li no default
\li One-tailed tests only

\li This function uses the \ref designated syntax for inputs.
*/
#ifdef APOP_NO_VARIADIC
double apop_test(double statistic, char *distribution, double p1, double p2, char tail){
#else
apop_varad_head(double, apop_test){
    double  apop_varad_var(statistic, 0);
    char*  apop_varad_var(distribution, NULL);
    double apop_varad_var(p1, 0);
    double apop_varad_var(p2, 0);
455 456
    int is_chi = !strcasecmp(distribution, "chi squared")|| !strcasecmp(distribution, "chi")
                     || !strcasecmp(distribution, "chisq");
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
     Apop_stopif(!strcasecmp(distribution, "f") && (!p1 || !p2), return NAN, 0, "I need both a p1 and p2 parameter specifying the degrees of freedom.");
     Apop_stopif((!strcasecmp(distribution, "t") || !strcasecmp(distribution, "f") || is_chi)
             && !p1, return NAN, 0, "I need a p1 parameter specifying the degrees of freedom.");
     if (!p2 && (!distribution || !strcasecmp(distribution, "normal") || !strcasecmp(distribution, "gaussian") ))
         p2 = 1;
     if (!p2 && p1 >= 0 && !strcasecmp(distribution, "uniform"))
         p2 = 1;

    char apop_varad_var(tail, 0);
    if (!tail) tail = is_chi ? 'u' : 'a';
    return apop_test_base(statistic, distribution, p1, p2, tail);
}

 double apop_test_base(double statistic, char *distribution, double p1, double p2, char tail){
#endif
    //This is a long and boring function. I am aware that there are
    //clever way to make it shorter.
     if (!distribution || !strcasecmp(distribution, "normal") || !strcasecmp(distribution, "gaussian") ){
         if (tail == 'u')
             return gsl_cdf_gaussian_Q(p1-statistic, p2);
         else if (tail == 'l')
             return gsl_cdf_gaussian_P(p1-statistic, p2);
         else
             return 2 * gsl_cdf_gaussian_Q(fabs(p1-statistic), p2);
     }
    else if (!strcasecmp(distribution, "lognormal")){
         if (tail == 'u')
             return gsl_cdf_lognormal_Q(statistic, p1, p2);
         else if (tail == 'l')
             return gsl_cdf_lognormal_P(statistic, p1, p2);
         else
             Apop_assert(0, "A two-tailed test doesn't really make sense for the lognormal. Please specify either tail= 'u' or tail= 'l'.");
     }
    else if (!strcasecmp(distribution, "t")){
         if (tail == 'u')
             return gsl_cdf_tdist_Q(statistic, p1);
         else if (tail == 'l')
             return gsl_cdf_tdist_P(statistic, p1);
         else
             return 2 * gsl_cdf_tdist_Q(fabs(statistic), p1);
     }
    else if (!strcasecmp(distribution, "f")){
         if (tail == 'u')
             return gsl_cdf_fdist_Q(statistic, p1, p2);
         else if (tail == 'l')
             return gsl_cdf_fdist_P(statistic, p1, p2);
         else
             Apop_assert(0, "A two-tailed test doesn't really make sense for the %s. Please specify either tail= 'u' or tail= 'l'.", distribution);
     }
    else if (!strcasecmp(distribution, "chi squared")|| !strcasecmp(distribution, "chi")
                                                || !strcasecmp(distribution, "chisq")){
         if (tail == 'u')
             return gsl_cdf_chisq_Q(statistic, p1);
         else if (tail == 'l')
             return gsl_cdf_chisq_P(statistic, p1);
         else
             Apop_assert(0, "A two-tailed test doesn't really make sense for the %s. Please specify either tail= 'u' or tail= 'l'.", distribution);
     }
    else if (!strcasecmp(distribution, "uniform")){
         if (tail == 'u')
             return gsl_cdf_flat_Q(statistic, p1, p2);
         else if (tail == 'l')
             return gsl_cdf_flat_P(statistic, p1, p2);
         else
             return 2 * gsl_cdf_flat_Q(fabs(statistic - (p1+p2)/2.), p1, p2);
     }
    Apop_assert(0, "Sorry, but I don't recognize %s as a distribution", distribution);
}