% Conference proceedings titles, keyed by series abbreviation plus edition number.
% Use these macros (unquoted) in booktitle fields so each venue is spelled one way.
@string{aistats8  = {8th International Conference on Artificial Intelligence and Statistics}}
@string{aistats9  = {9th International Conference on Artificial Intelligence and Statistics}}
@string{aistats10 = {10th International Conference on Artificial Intelligence and Statistics}}
@string{aistats11 = {11th International Conference on Artificial Intelligence and Statistics}}
@string{aistats12 = {12th International Conference on Artificial Intelligence and Statistics}}
@string{aistats13 = {13th International Conference on Artificial Intelligence and Statistics}}
@string{aistats14 = {14th International Conference on Artificial Intelligence and Statistics}}
@string{icml20    = {20th International Conference on Machine Learning}}
@string{icml21    = {21st International Conference on Machine Learning}}
@string{icml22    = {22nd International Conference on Machine Learning}}
@string{icml23    = {23rd International Conference on Machine Learning}}
@string{icml24    = {24th International Conference on Machine Learning}}
@string{icml25    = {25th International Conference on Machine Learning}}
@string{icml26    = {26th International Conference on Machine Learning}}
@string{icml27    = {27th International Conference on Machine Learning}}
@string{icml28    = {28th International Conference on Machine Learning}}
@string{nips8     = {Advances in Neural Information Processing Systems 8}}
@string{nips9     = {Advances in Neural Information Processing Systems 9}}
@string{nips10    = {Advances in Neural Information Processing Systems 10}}
@string{nips11    = {Advances in Neural Information Processing Systems 11}}
@string{nips12    = {Advances in Neural Information Processing Systems 12}}
@string{nips13    = {Advances in Neural Information Processing Systems 13}}
@string{nips14    = {Advances in Neural Information Processing Systems 14}}
@string{nips15    = {Advances in Neural Information Processing Systems 15}}
@string{nips16    = {Advances in Neural Information Processing Systems 16}}
@string{nips17    = {Advances in Neural Information Processing Systems 17}}
@string{nips18    = {Advances in Neural Information Processing Systems 18}}
@string{nips19    = {Advances in Neural Information Processing Systems 19}}
@string{nips20    = {Advances in Neural Information Processing Systems 20}}
@string{nips21    = {Advances in Neural Information Processing Systems 21}}
@string{nips22    = {Advances in Neural Information Processing Systems 22}}
@string{nips23    = {Advances in Neural Information Processing Systems 23}}
@string{nips24    = {Advances in Neural Information Processing Systems 24}}
@string{nips25    = {Advances in Neural Information Processing Systems 25}}
@string{cogsci32  = {The Proceedings of the 32nd Annual Meeting of the Cognitive Science Society}}
@string{uai22     = {22nd Conference on Uncertainty in Artificial Intelligence}}
@string{uai27     = {27th Conference on Uncertainty in Artificial Intelligence}}

% Publisher, venue, journal and book-series names.
% Reference these macros unquoted (e.g. publisher = mit, journal = jmlr).
@string{cup    = {Cambridge University Press}}
@string{AAAI   = {AAAI Press}}
@string{mit    = {The {MIT} Press}}
@string{oup    = {Oxford University Press}}
@string{WILEY  = {John Wiley \& Sons}}
@string{rss9   = {9th International Conference on Robotics: Science \& Systems}}
@string{IECY   = {IEEE Transactions on System Science and Cybernetics}}
@string{IENN   = {IEEE Transactions on Neural Networks}}
@string{PAMI   = {IEEE Transactions on Pattern Analysis and Machine Intelligence}}
@string{tcbb   = {IEEE/ACM Transactions on Computational Biology and Bioinformatics}}
@string{jasa   = {Journal of the American Statistical Association}}
@string{jmlr   = {Journal of Machine Learning Research}}
@string{jrssB  = {Journal of the Royal Statistical Society, Series B}}
@string{jcst   = {Journal of Computer Science and Technology}}
@string{nc     = {Neural Computation}}
@string{PCPS   = {Proceedings of the Cambridge Philosophical Society}}
@string{tams   = {The Annals of Mathematical Statistics}}
@string{lncs   = {Lecture Notes in Computer Science (LNCS)}}

@inproceedings{Roy11,
  author    = {Daniel M. Roy},
  title     = {On the computability and complexity of {Bayesian} reasoning},
  booktitle = {NIPS Workshop on Philosophy and Machine Learning},
  year      = 2011,
  url       = {http://danroy.org/papers/Roy-NIPSPML-2011.pdf},
  cat       = {np gm},
  abstract  = { If we consider the claim made by some cognitive scientists that
                   the mind performs Bayesian reasoning, and if we simultaneously
                   accept the Physical Church-Turing thesis and thus believe that
                   the computational power of the mind is no more than that of a
                   Turing machine, then what limitations are there to the reasoning
                   abilities of the mind?  I give an overview of joint work with
                   Nathanael Ackerman (Harvard, Mathematics) and Cameron Freer
                   (MIT, CSAIL) that bears on the computability and complexity of
                   Bayesian reasoning.  In particular, we prove that conditional
                   probability is in general not computable in the presence of
                   continuous random variables.  However, in light of additional
                   structure in the prior distribution, such as the presence of
                   certain types of noise, or of exchangeability, conditioning is
                   possible.  These results cover most of statistical practice.
                   At the workshop on Logic and Computational Complexity, we presented
                   results on the computational complexity of conditioning,
                   embedding sharp-P-complete problems in the task of computing
                   conditional probabilities for diffuse continuous random variables.
                   This work complements older work. For example, under
                   cryptographic assumptions, the computational complexity of
                   producing samples and computing probabilities was separated by
                   Ben-David, Chor, Goldreich and Luby.  In recent work, we also
                   make use of cryptographic assumptions to show that different
                   representations of exchangeable sequences may have vastly different
                   complexity.  However, when faced with an adversary that is
                   computationally bounded, these different representations have the
                   same complexity, highlighting the fact that knowledge
                   representation and approximation play a fundamental role in the
                   possibility and plausibility of Bayesian reasoning.}
}

@inproceedings{SonRoy11,
  author    = {David Sontag and Daniel M. Roy},
  title     = {The {C}omplexity of {I}nference in {L}atent {D}irichlet
                  {A}llocation},
  booktitle = nips24,
  publisher = mit,
  address   = {Cambridge, MA, USA},
  year      = 2011,
  url       = {http://danroy.org/papers/SonRoy-NIPS-2011.pdf},
  cat       = {approx},
  abstract  = { We consider the computational complexity of probabilistic
                   inference in Latent Dirichlet Allocation (LDA).  First, we
                   study the problem of finding the maximum a posteriori (MAP)
                   assignment of topics to words, where the document's topic
                   distribution is integrated out.  We show that, when the
                   effective number of topics per document is small, exact
                   inference takes polynomial time. In contrast, we show that,
                   when a document has a large number of topics, finding the MAP
                   assignment of topics to words in LDA is NP-hard.  Next, we
                   consider the problem of finding the MAP topic distribution
                   for a document, where the topic-word assignments are integrated
                   out.  We show that this problem is also NP-hard.  Finally,
                   we briefly discuss the problem of sampling from the posterior,
                   showing that this is NP-hard in one restricted setting,
                   but leaving open the general question. }
}

@inproceedings{AbbHelGrietal11,
  author    = {Joshua Abbott and Katherine A. Heller and Zoubin
                  Ghahramani and Thomas L. Griffiths},
  title     = {Testing a {Bayesian} Measure of Representativeness
                  Using a Large Image Database},
  booktitle = nips24,
  publisher = mit,
  address   = {Cambridge, MA, USA},
  year      = 2011,
  url       = {.},
  cat       = {clust},
  abstract  = {How do people determine which elements of a set are
                  most representative of that set? We extend an
                  existing Bayesian measure of representativeness,
                  which indicates the representativeness of a sample
                  from a distribution, to define a measure of the
                  representativeness of an item to a set. We show that
                  this measure is formally related to a machine
                  learning method known as Bayesian Sets. Building on
                  this connection, we derive an analytic expression
                  for the representativeness of objects described by a
                  sparse vector of binary features. We then apply this
                  measure to a large database of images, using it to
                  determine which images are the most representative
                  members of different sets. Comparing the resulting
                  predictions to human judgments of representativeness
                  provides a test of this measure with naturalistic
                  stimuli, and illustrates how databases that are more
                  commonly used in computer vision and machine
                  learning can be used to evaluate psychological
                  theories.}
}

@inproceedings{AdaGha09,
  author    = {Ryan P. Adams and Zoubin Ghahramani},
  title     = {Archipelago: nonparametric {Bayesian}
                  semi-supervised learning},
  booktitle = icml26,
  editor    = {L\'{e}on Bottou and Michael Littman},
  pages     = {1--8},
  year      = 2009,
  month     = {June},
  address   = {Montr\'{e}al, QC, Canada},
  publisher = {Omnipress},
  url       = {.},
  cat       = {ssl np gp},
  abstract  = {Semi-supervised learning (SSL), is classification
                  where additional unlabeled data can be used to
                  improve accuracy. Generative approaches are
                  appealing in this situation, as a model of the
                  data's probability density can assist in identifying
                  clusters. Nonparametric Bayesian methods, while
                  ideal in theory due to their principled motivations,
                  have been difficult to apply to SSL in practice. We
                  present a nonparametric Bayesian method that uses
                  Gaussian processes for the generative model,
                  avoiding many of the problems associated with
                  Dirichlet process mixture models. Our model is fully
                  generative and we take advantage of recent advances
                  in Markov chain Monte Carlo algorithms to provide a
                  practical inference method. Our method compares
                  favorably to competing approaches on synthetic and
                  real-world multi-class data.},
  annote    = {This paper was awarded Honourable Mention for Best
                  Paper at ICML 2009.}
}

@inproceedings{AdaGhaJor10,
  author    = {Ryan P. Adams and Zoubin Ghahramani and Michael
                  I. Jordan},
  title     = {Tree-Structured Stick Breaking for Hierarchical
                  Data},
  booktitle = nips23,
  publisher = mit,
  year      = 2010,
  url       = {.},
  cat       = {np clust},
  abstract  = {Many data are naturally modeled by an unobserved
                  hierarchical structure. In this paper we propose a
                  flexible nonparametric prior over unknown data
                  hierarchies. The approach uses nested stick-breaking
                  processes to allow for trees of unbounded width and
                  depth, where data can live at any node and are
                  infinitely exchangeable. One can view our model as
                  providing infinite mixtures where the components
                  have a dependency structure corresponding to an
                  evolutionary diffusion down a tree. By using a
                  stick-breaking approach, we can apply Markov chain
                  Monte Carlo methods based on slice sampling to
                  perform Bayesian inference and simulate from the
                  posterior distribution on trees. We apply our method
                  to hierarchical clustering of images and topic
                  modeling of text data. }
}

@inproceedings{AdaWalGha10,
  author    = {Ryan P. Adams and H.~Wallach and Zoubin Ghahramani},
  title     = {Learning the Structure of Deep Sparse Graphical
                  Models},
  booktitle = aistats13,
  editor    = {Yee Whye Teh and Mike Titterington},
  pages     = {1--8},
  year      = 2010,
  month     = {May},
  address   = {Chia Laguna, Sardinia, Italy},
  url       = {.},
  cat       = {np gm},
  abstract  = {Deep belief networks are a powerful way to model
                  complex probability distributions. However, it is
                  difficult to learn the structure of a belief
                  network, particularly one with hidden units. The
                  Indian buffet process has been used as a
                  nonparametric Bayesian prior on the structure of a
                  directed belief network with a single infinitely
                  wide hidden layer. Here, we introduce the cascading
                  Indian buffet process (CIBP), which provides a prior
                  on the structure of a layered, directed belief
                  network that is unbounded in both depth and width,
                  yet allows tractable inference. We use the CIBP
                  prior with the nonlinear Gaussian belief network
                  framework to allow each unit to vary its behavior
                  between discrete and continuous representations. We
                  use Markov chain Monte Carlo for inference in this
                  model and explore the structures learned on image
                  data.},
  annote    = {Winner of the Best Paper Award}
}

@article{AndSzyRasetal02,
  author   = {Irene K.~Andersen and Anna Szymkowiak and Carl
                  Edward Rasmussen and L.~G.~Hanson and
                  J.~R.~Marstrand and H.~B.~W.~Larsson and Lars Kai
                  Hansen},
  title    = {Perfusion Quantification using {G}aussian Process
                  Deconvolution},
  journal  = {Magnetic Resonance in Medicine},
  volume   = 48,
  number   = 2,
  pages    = {351--361},
  year     = 2002,
  doi      = {10.1002/mrm.10213},
  url      = {.},
  cat      = {gp},
  abstract = {The quantification of perfusion using dynamic
                  susceptibility contrast MR imaging requires
                  deconvolution to obtain the residual
                  impulse-response function (IRF). Here, a method
                  using a Gaussian process for deconvolution, GPD, is
                  proposed. The fact that the IRF is smooth is
                  incorporated as a constraint in the method. The GPD
                  method, which automatically estimates the noise
                  level in each voxel, has the advantage that model
                  parameters are optimized automatically. The GPD is
                  compared to singular value decomposition (SVD) using
                  a common threshold for the singular values and to
                  SVD using a threshold optimized according to the
                  noise level in each voxel. The comparison is carried
                  out using artificial data as well as using data from
                  healthy volunteers. It is shown that GPD is
                  comparable to SVD variable optimized threshold when
                  determining the maximum of the IRF, which is
                  directly related to the perfusion. GPD provides a
                  better estimate of the entire IRF. As the signal to
                  noise ratio increases or the time resolution of the
                  measurements increases, GPD is shown to be superior
                  to SVD. This is also found for large distribution
                  volumes.}
}

@inproceedings{ArnSchLar09,
  author    = {Morten Arngren and Mikkel N.~Schmidt and Jan Larsen},
  title     = {Bayesian nonnegative matrix factorization with
                  volume prior for unmixing of hyperspectral images},
  booktitle = {Machine Learning for Signal Processing, IEEE
                  Workshop on (MLSP)},
  pages     = {1--6},
  year      = 2009,
  month     = {September},
  address   = {Grenoble, France},
  isbn      = {978-1-4244-4947-7},
  doi       = {10.1109/MLSP.2009.5306262},
  url       = {.},
  abstract  = {In hyperspectral image analysis the objective is to
                  unmix a set of acquired pixels into pure spectral
                  signatures (endmembers) and corresponding fractional
                  abundances. The Non-negative Matrix Factorization
                  (NMF) methods have received a lot of attention for
                  this unmixing process. Many of these NMF based
                  unmixing algorithms are based on sparsity
                  regularization encouraging pure spectral endmembers,
                  but this is not optimal for certain applications,
                  such as foods, where abundances are not sparse. The
                  pixels will theoretically lie on a simplex and hence
                  the endmembers can be estimated as the vertices of
                  the smallest enclosing simplex. In this context we
                  present a Bayesian framework employing a volume
                  constraint for the NMF algorithm, where the
                  posterior distribution is numerically sampled from
                  using a Gibbs sampling procedure. We evaluate the
                  method on synthetical and real hyperspectral data of
                  wheat kernels.},
  annote    = {This paper was ``rated among the best papers
                  submitted'' to the 2009 Machine Learning for Signal
                  Processing conference.}
}

@inproceedings{Azr07,
  author    = {Arik Azran},
  title     = {The {R}endezvous algorithm: multiclass
                  semi-supervised learning with {M}arkov random walks},
  booktitle = icml24,
  editor    = {Zoubin Ghahramani},
  pages     = {49--56},
  year      = 2007,
  month     = {June},
  address   = {Corvallis, OR, USA},
  publisher = {Omnipress},
  url       = {.},
  cat       = {ssl},
  abstract  = {We consider the problem of multiclass classification
                  where both labeled and unlabeled data points are
                  given. We introduce and demonstrate a new approach
                  for estimating a distribution over the missing
                  labels where data points are viewed as nodes of a
                  graph, and pairwise similarities are used to derive
                  a transition probability matrix P for a Markov
                  random walk between them. The algorithm associates
                  each point with a particle which moves between
                  points according to P. Labeled points are set to be
                  absorbing states of the Markov random walk, and the
                  probability of each particle to be absorbed by the
                  different labeled points, as the number of steps
                  increases, is then used to derive a distribution
                  over the associated missing label. A computationally
                  efficient algorithm to implement this is derived and
                  demonstrated on both real and artificial data sets,
                  including a numerical comparison with other
                  methods.}
}

@inproceedings{AzrGha06,
  author    = {Arik Azran and Zoubin Ghahramani},
  title     = {A new approach to data driven clustering},
  booktitle = icml23,
  editor    = {William Cohen and Andrew Moore},
  pages     = {57--64},
  year      = 2006,
  month     = {June},
  address   = {Pittsburgh, PA, USA},
  publisher = {Omnipress},
  url       = {.},
  abstract  = {We consider the problem of clustering in its most
                  basic form where only a local metric on the data
                  space is given. No parametric statistical model is
                  assumed, and the number of clusters is learned from
                  the data. We introduce, analyze and demonstrate a
                  novel approach to clustering where data points are
                  viewed as nodes of a graph, and pairwise
                  similarities are used to derive a transition
                  probability matrix P for a Markov random walk
                  between them. The algorithm automatically reveals
                  structure at increasing scales by varying the number
                  of steps taken by this random walk. Points are
                  represented as rows of Pt, which are the t-step
                  distributions of the walk starting at that point;
                  these distributions are then clustered using a
                  KL-minimizing iterative algorithm. Both the number
                  of clusters, and the number of steps that best
                  reveal it, are found by optimizing spectral
                  properties of P.}
}

@inproceedings{AzrGha06b,
  author    = {Arik Azran and Zoubin Ghahramani},
  title     = {Spectral Methods for Automatic Multiscale Data
                  Clustering},
  booktitle = {IEEE Conference on Computer Vision and Pattern
                  Recognition (CVPR)},
  pages     = {190--197},
  year      = 2006,
  month     = {June},
  address   = {New York, NY, USA},
  publisher = {IEEE Computer Society},
  isbn      = {0-7695-2597-0},
  doi       = {10.1109/CVPR.2006.289},
  url       = {.},
  abstract  = {Spectral clustering is a simple yet powerful method
                  for finding structure in data using spectral
                  properties of an associated pairwise similarity
                  matrix. This paper provides new insights into how
                  the method works and uses these to derive new
                  algorithms which given the data alone automatically
                  learn different plausible data partitionings. The
                  main theoretical contribution is a generalization of
                  a key result in the field, the multicut lemma (Meila
                  2001). We use this generalization to derive two
                  algorithms. The first uses the eigenvalues of a
                  given affinity matrix to infer the number of
                  clusters in data, and the second combines learning
                  the affinity matrix with inferring the number of
                  clusters. A hierarchical implementation of the
                  algorithms is also derived. The algorithms are
                  theoretically motivated and demonstrated on
                  nontrivial data sets.}
}

@inproceedings{BeaGhaRas02,
  author    = {Matthew J.~Beal and Zoubin Ghahramani and Carl
                  Edward Rasmussen},
  title     = {The Infinite Hidden {M}arkov Model},
  booktitle = nips14,
  editor    = {T. Dietterich and S. Becker and Z. Ghahramani},
  pages     = {577--584},
  year      = 2002,
  month     = {December},
  address   = {Cambridge, MA, USA},
  publisher = mit,
  url       = {.},
  cat       = {np},
  abstract  = {We show that it is possible to extend hidden Markov
                  models to have a countably infinite number of hidden
                  states. By using the theory of Dirichlet processes
                  we can implicitly integrate out the infinitely many
                  transition parameters, leaving only three
                  hyperparameters which can be learned from
                  data. These three hyperparameters define a
                  hierarchical Dirichlet process capable of capturing
                  a rich set of transition dynamics. The three
                  hyperparameters control the time scale of the
                  dynamics, the sparsity of the underlying
                  state-transition matrix, and the expected number of
                  distinct hidden states in a finite sequence. In this
                  framework it is also natural to allow the alphabet
                  of emitted symbols to be infinite --- consider, for
                  example, symbols being possible words appearing in
                  English text.}
}

@inproceedings{BraOrt10,
  author    = {Daniel A.~Braun and Pedro A.~Ortega},
  title     = {A minimum relative entropy principle for adaptive
                  control in linear quadratic regulators},
  booktitle = {Proceedings of the 7th International Conference on
                  Informatics in Control, Automation and Robotics},
  year      = 2010,
  note      = {In press},
  url       = {.},
  cat       = {rl},
  abstract  = {The design of optimal adaptive controllers is
                  usually based on heuristics, because solving
                  Bellman's equations over information states is
                  notoriously intractable. Approximate adaptive
                  controllers often rely on the principle of
                  certainty-equivalence where the control process
                  deals with parameter point estimates as if they
                  represented ``true'' parameter values. Here we
                  present a stochastic control rule instead where
                  controls are sampled from a posterior distribution
                  over a set of probabilistic input-output models and
                  the true model is identified by Bayesian
                  inference. This allows reformulating the adaptive
                  control problem as an inference and sampling problem
                  derived from a minimum relative entropy principle.
                  Importantly, inference and action sampling both work
                  forward in time and hence such a Bayesian adaptive
                  controller is applicable on-line. We demonstrate the
                  improved performance that can be achieved by such an
                  approach for linear quadratic regulator examples.}
}

@inproceedings{BraOrtTheSch11,
  author    = {Daniel A.~Braun and Pedro A.~Ortega and Evangelos
                  Theodorou and Stefan Schaal},
  title     = {Path Integral Control and Bounded Rationality},
  booktitle = {2011 {IEEE} Symposium on Adaptive Dynamic
                  Programming and Reinforcement Learning},
  year      = 2011,
  url       = {.},
  cat       = {rl},
  abstract  = {Path integral methods have recently been shown to be
                  applicable to a very general class of optimal
                  control problems. Here we examine the path integral
                  formalism from a decision-theoretic point of view,
                  since an optimal controller can always be regarded
                  as an instance of a perfectly rational
                  decision-maker that chooses its actions so as to
                  maximize its expected utility. The problem with
                  perfect rationality is, however, that finding
                  optimal actions is often very difficult due to
                  prohibitive computational resource costs that are
                  not taken into account. In contrast, a bounded
                  rational decision-maker has only limited resources
                  and therefore needs to strike some compromise
                  between the desired utility and the required
                  resource costs. In particular, we suggest an
                  information-theoretic measure of resource costs that
                  can be derived axiomatically. As a consequence we
                  obtain a variational principle for choice
                  probabilities that trades off maximizing a given
                  utility criterion and avoiding resource costs that
                  arise due to deviating from initially given default
                  choice probabilities. The resulting bounded rational
                  policies are in general probabilistic. We show that
                  the solutions found by the path integral formalism
                  are such bounded rational policies. Furthermore, we
                  show that the same formalism generalizes to discrete
                  control problems, leading to linearly solvable
                  bounded rational control policies in the case of
                  Markov systems. Importantly, Bellman's optimality
                  principle is not presupposed by this variational
                  principle, but it can be derived as a limit
                  case. This suggests that the information theoretic
                  formalization of bounded rationality might serve as
                  a general principle in control design that unifies a
                  number of recently reported approximate optimal
                  control methods both in the continuous and discrete
                  domain.}
}

@article{BraOrtWol11,
  author   = {Daniel A. Braun and Pedro A. Ortega and Daniel
                  M. Wolpert},
  title    = {Motor coordination: When two have to act as one},
  journal  = {Experimental Brain Research},
  year     = 2011,
  note     = {Special issue on Joint Action},
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/21455618},
  abstract = {Trying to pass someone walking toward you in a
                  narrow corridor is a familiar example of a
                  two-person motor game that requires coordination. In
                  this study, we investigate coordination in
                  sensorimotor tasks that correspond to classic
                  coordination games with multiple Nash equilibria,
                  such as ``choosing sides'', ``stag hunt'', ``chicken'',
                  and ``battle of sexes''. In these tasks, subjects made
                  reaching movements reflecting their continuously
                  evolving ``decisions'' while they received a
                  continuous payoff in the form of a resistive force
                  counteracting their movements. Successful
                  coordination required two subjects to ``choose'' the
                  same Nash equilibrium in this force-payoff landscape
                  within a single reach. We found that on the majority
                  of trials coordination was achieved. Compared to the
                  proportion of trials in which miscoordination
                  occurred, successful coordination was characterized
                  by several distinct features: an increased mutual
                  information between the players' movement endpoints,
                  an increased joint entropy during the movements, and
                  by differences in the timing of the players'
                  responses.  Moreover, we found that the probability
                  of successful coordination depends on the players'
                  initial distance from the Nash equilibria. Our
                  results suggest that two-person coordination arises
                  naturally in motor interactions and is facilitated
                  by favorable initial positions, stereotypical motor
                  pattern, and differences in response times.}
}

@article{BraOrtWol11b,
  author   = {Daniel A.~Braun and
              Pedro A.~Ortega and
              Daniel M.~Wolpert},
  title    = {Nash equilibria in multi-agent motor interactions},
  journal  = {PLoS Computational Biology},
  volume   = 5,
  number   = 8,
  year     = 2009,
  url      = {http://www.ncbi.nlm.nih.gov/pubmed/19680426},
  abstract = {Social interactions in classic cognitive games like
              the ultimatum game or the prisoner's dilemma
              typically lead to Nash equilibria when multiple
              competitive decision makers with perfect knowledge
              select optimal strategies. However, in evolutionary
              game theory it has been shown that Nash equilibria
              can also arise as attractors in dynamical systems
              that can describe, for example, the population
              dynamics of microorganisms. Similar to such
              evolutionary dynamics, we find that Nash equilibria
              arise naturally in motor interactions in which
              players vie for control and try to minimize
              effort. When confronted with sensorimotor
              interaction tasks that correspond to the classical
              prisoner's dilemma and the rope-pulling game,
              two-player motor interactions led predominantly to
              Nash solutions. In contrast, when a single player
              took both roles, playing the sensorimotor game
              bimanually, cooperative solutions were found. Our
              methodology opens up a new avenue for the study of
              human motor interactions within a game theoretic
              framework, suggesting that the coupling of motor
              systems can lead to game theoretic solutions.}
}

@inproceedings{BraVanVlaGha10,
  cat       = {np mcmc},
  author    = {S{\'e}bastien Brati{\`e}res and Jurgen {Van Gael} and
               Andreas Vlachos and Zoubin Ghahramani},
  title     = {Scaling the {iHMM}: {Parallelization} versus
               {Hadoop}},
  booktitle = {Proceedings of the 2010 10th IEEE International
               Conference on Computer and Information Technology},
  pages     = {1235--1240},
  year      = 2010,
  address   = {Bradford, UK},
  publisher = {IEEE Computer Society},
  isbn      = {978-0-7695-4108-2},
  doi       = {10.1109/CIT.2010.223},
  url       = {.},
  abstract  = {This paper compares parallel and distributed
               implementations of an iterative, Gibbs sampling,
               machine learning algorithm. Distributed
               implementations run under Hadoop on facility
               computing clouds. The probabilistic model under
               study is the infinite HMM [1], in which parameters
               are learnt using an instance blocked Gibbs sampling,
               with a step consisting of a dynamic program. We
               apply this model to learn part-of-speech tags from
               newswire text in an unsupervised fashion. However
               our focus here is on runtime performance, as opposed
               to NLP-relevant scores, embodied by iteration
               duration, ease of development, deployment and
               debugging.}
}

@article{ChaCunGlo09,
  cat      = {gp},
  author   = {C. Chang and J. P. Cunningham and G. Glover},
  title    = {Influence of heart rate on the {BOLD} signal: the
              cardiac response function},
  journal  = {NeuroImage},
  volume   = 44,
  pages    = {857--869},
  year     = 2009,
  url      = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/ChangNeuroimage2009.pdf},
  abstract = {It has previously been shown that low-frequency
              fluctuations in both respiratory volume and cardiac
              rate can induce changes in the blood-oxygen level
              dependent (BOLD) signal. Such physiological noise
              can obscure the detection of neural activation using
              fMRI, and it is therefore important to model and
              remove the effects of this noise. While a
              hemodynamic response function relating respiratory
              variation (RV) and the BOLD signal has been
              described, no such mapping for heart rate (HR) has
              been proposed. In the current study, the effects of
              RV and HR are simultaneously deconvolved from
              resting state fMRI. It is demonstrated that a
              convolution model including RV and HR can explain
              significantly more variance in gray matter BOLD
              signal than a model that includes RV alone, and an
              average HR response function is proposed that well
              characterizes our subject population. It is observed
              that the voxel-wise morphology of the deconvolved RV
              responses is preserved when HR is included in the
              model, and that its form is adequately modeled by
              Birn et al.'s previously described respiration
              response function. Furthermore, it is shown that
              modeling out RV and HR can significantly alter
              functional connectivity maps of the default-mode
              network.}
}

@article{ChuCunKauetal10,
  author   = {M. M. Churchland and J. P. Cunningham and
              M. T. Kaufman and S. I. Ryu and K. V. Shenoy},
  title    = {Cortical preparatory activity: Representation of
              movement or first cog in a dynamical machine?},
  journal  = {Neuron},
  volume   = 68,
  pages    = {387--400},
  year     = 2010,
  url      = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/ChurchlandNeuron2010.pdf},
  abstract = {The motor cortices are active during both movement
              and movement preparation. A common assumption is
              that preparatory activity constitutes a subthreshold
              form of movement activity: a neuron active during
              rightward movements becomes modestly active during
              preparation of a rightward movement. We asked
              whether this pattern of activity is, in fact,
              observed. We found that it was not: at the level of
              a single neuron, preparatory tuning was weakly
              correlated with movement-period tuning. Yet,
              somewhat paradoxically, preparatory tuning could be
              captured by a preferred direction in an abstract
              "space" that described the population-level pattern
              of movement activity. In fact, this relationship
              accounted for preparatory responses better than did
              traditional tuning models. These results are
              expected if preparatory activity provides the
              initial state of a dynamical system whose evolution
              produces movement activity. Our results thus suggest
              that preparatory activity may not represent specific
              factors, and may instead play a more mechanistic
              role.}
}

@inproceedings{ChuGha09,
  author    = {W. Chu and Z. Ghahramani},
  title     = {Probabilistic models for incomplete
               multi-dimensional arrays},
  booktitle = aistats12,
  editor    = {D. van Dyk and M. Welling},
  volume    = 5,
  pages     = {89--96},
  year      = 2009,
  month     = {April},
  address   = {Clearwater Beach, FL, USA},
  publisher = {Microtome Publishing (paper) Journal of Machine
               Learning Research},
  note      = {ISSN 1938-7228},
  url       = {.},
  abstract  = {In multiway data, each sample is measured by
               multiple sets of correlated attributes. We develop a
               probabilistic framework for modeling structural
               dependency from partially observed multi-dimensional
               array data, known as pTucker. Latent components
               associated with individual array dimensions are
               jointly retrieved while the core tensor is
               integrated out. The resulting algorithm is capable
               of handling large-scale data sets. We verify the
               usefulness of this approach by comparing against
               classical models on applications to modeling amino
               acid fluorescence, collaborative filtering and a
               number of benchmark multiway array data.}
}

@inproceedings{ChuSinGhaetal07,
  cat       = {gp},
  author    = {W. Chu and V. Sindhwani and Z. Ghahramani and
               S. Keerthi},
  title     = {Relational learning with {G}aussian processes},
  booktitle = nips19,
  editor    = {B. Sch{\"o}lkopf and J. Platt and T. Hofmann},
  series    = {Bradford Books},
  volume    = 19,
  pages     = {289--296},
  year      = 2007,
  month     = {September},
  address   = {Cambridge, MA, USA},
  publisher = mit,
  note      = {Online contents gives pages 314--321, and 289--296
               on pdf of contents},
  url       = {.},
  abstract  = {Correlation between instances is often modelled via
               a kernel function using input attributes of the
               instances. Relational knowledge can further reveal
               additional pairwise correlations between variables
               of interest. In this paper, we develop a class of
               models which incorporates both reciprocal relational
               information and input attributes using Gaussian
               process techniques. This approach provides a novel
               non-parametric Bayesian framework with a
               data-dependent prior for supervised learning
               tasks. We also apply this framework to
               semi-supervised learning. Experimental results on
               several real world data sets verify the usefulness
               of this algorithm.}
}

@article{ChuYuCunetal10,
  cat      = {gp},
  author   = {M. M. Churchland and B. M. Yu and J. P. Cunningham
              and L. P. Sugrue and M. R. Cohen and G. S. Corrado
              and W. T. Newsome and A. M. Clark and P. Hosseini
              and B. B. Scott and D. C. Bradley and M. A. Smith
              and A. Kohn and J. A. Movshon and K. M. Armstrong
              and T. Moore and S. W. Chang and L. H. Snyder and
              S. G. Lisberger and N. J. Priebe and I. M. Finn and
              D. Ferster and S. I. Ryu and G. Santhanam and
              M. Sahani and K. V. Shenoy},
  title    = {Stimulus onset quashes neural variability: a
              widespread cortical phenomenon},
  journal  = {Nature Neuroscience},
  volume   = 13,
  pages    = {369--378},
  year     = 2010,
  url      = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/ChurchlandNN2010.pdf},
  abstract = {Neural responses are typically characterized by
              computing the mean firing rate, but response
              variability can exist across trials. Many studies
              have examined the effect of a stimulus on the mean
              response, but few have examined the effect on
              response variability. We measured neural variability
              in 13 extracellularly recorded datasets and one
              intracellularly recorded dataset from seven areas
              spanning the four cortical lobes in monkeys and
              cats. In every case, stimulus onset caused a decline
              in neural variability. This occurred even when the
              stimulus produced little change in mean firing
              rate. The variability decline was observed in
              membrane potential recordings, in the spiking of
              individual neurons and in correlated spiking
              variability measured with implanted 96-electrode
              arrays. The variability decline was observed for all
              stimuli tested, regardless of whether the animal was
              awake, behaving or anaesthetized. This widespread
              variability decline suggests a rather general
              property of cortex, that its state is stabilized by
              an input.}
}

@techreport{Cun08,
  cat         = {gp approx},
  author      = {J. P. Cunningham},
  title       = {Derivation of {Expectation} {Propagation} for ``Fast
                 {Gaussian} process methods for point process
                 intensity estimation''},
  institution = {Stanford University},
  year        = 2008,
  url         = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/CunninghamEPTR2008.pdf},
  abstract    = {We derive the Expectation Propagation algorithm
                 updates for approximating the posterior distribution
                 on intensity in a conditionally inhomogeneous gamma
                 interval process with a Gaussian Process prior (GP
                 IGIP), a model which appeared in Cunningham, Shenoy,
                 Sahani (2008) ICML.}
}

@article{CunNuyGiletal11,
  cat      = {time},
  author   = {J. P. Cunningham and P. Nuyujukian and V. Gilja and
              C. A. Chestek and S. I. Ryu and K. V. Shenoy},
  title    = {A closed-loop human simulator for investigating the
              role of feedback-control in brain-machine
              interfaces},
  journal  = {Journal of Neurophysiology},
  volume   = 105,
  pages    = {1932--1949},
  year     = 2011,
  url      = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/CunninghamJNP2011.pdf},
  abstract = {Neural prosthetic systems seek to improve the lives
              of severely disabled people by decoding neural
              activity into useful behavioral commands. These
              systems and their decoding algorithms are typically
              developed "offline", using neural activity
              previously gathered from a healthy animal, and the
              decoded movement is then compared with the true
              movement that accompanied the recorded neural
              activity. However, this offline design and testing
              may neglect important features of a real prosthesis,
              most notably the critical role of feedback control,
              which enables the user to adjust neural activity
              while using the prosthesis. We hypothesize that
              understanding and optimally designing
              high-performance decoders require an experimental
              platform where humans are in closed-loop with the
              various candidate decode systems and algorithms. It
              remains unexplored the extent to which the subject
              can, for a particular decode system, algorithm, or
              parameter, engage feedback and other strategies to
              improve decode performance. Closed-loop testing may
              suggest different choices than offline
              analyses. Here we ask if a healthy human subject,
              using a closed-loop neural prosthesis driven by
              synthetic neural activity, can inform system
              design. We use this online prosthesis simulator
              (OPS) to optimize "online" decode performance based
              on a key parameter of a current state-of-the-art
              decode algorithm, the bin width of a Kalman
              filter. First, we show that offline and online
              analyses indeed suggest different parameter
              choices. Previous literature and our offline
              analyses agree that neural activity should be
              analyzed in bins of 100- to 300-ms width. OPS
              analysis, which incorporates feedback control,
              suggests that much shorter bin widths (25-50 ms)
              yield higher decode performance. Second, we confirm
              this surprising finding using a closed-loop rhesus
              monkey prosthetic system. These findings illustrate
              the type of discovery made possible by the OPS, and
              so we hypothesize that this novel testing approach
              will help in the design of prosthetic systems that
              will translate well to human patients.}
}

@inproceedings{CunSheSah08,
  cat       = {gp},
  author    = {J. P. Cunningham and K. V. Shenoy and M. Sahani},
  title     = {Fast {G}aussian process methods for point process
               intensity estimation},
  booktitle = icml25,
  pages     = {1--8},
  year      = 2008,
  month     = {June},
  address   = {Helsinki, Finland},
  url       = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/CunninghamICML2008.pdf},
  abstract  = {Point processes are difficult to analyze because
               they provide only a sparse and noisy observation of
               the intensity function driving the process. Gaussian
               Processes offer an attractive framework within which
               to infer underlying intensity functions. The result
               of this inference is a continuous function defined
               across time that is typically more amenable to
               analytical efforts. However, a naive implementation
               will become computationally infeasible in any
               problem of reasonable size, both in memory and run
               time requirements. We demonstrate problem specific
               methods for a class of renewal processes that
               eliminate the memory burden and reduce the solve
               time by orders of magnitude.}
}

@inproceedings{CunYuSheetal08,
  cat       = {gp},
  author    = {J. P. Cunningham and B. M. Yu and K. V. Shenoy and
               M. Sahani},
  title     = {Inferring neural firing rates from spike trains
               using {G}aussian processes},
  booktitle = nips20,
  pages     = {1--8},
  year      = 2008,
  month     = {December},
  address   = {Vancouver, BC},
  url       = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/CunninghamNIPS2008.pdf},
  abstract  = {Neural spike trains present challenges to analytical
               efforts due to their noisy, spiking nature. Many
               studies of neuroscientific and neural prosthetic
               importance rely on a smoothed, denoised estimate of
               the spike train's underlying firing rate. Current
               techniques to find time-varying firing rates require
               ad hoc choices of parameters, offer no confidence
               intervals on their estimates, and can obscure
               potentially important single trial variability. We
               present a new method, based on a Gaussian Process
               prior, for inferring probabilistically optimal
               estimates of firing rate functions underlying single
               or multiple neural spike trains. We test the
               performance of the method on simulated data and
               experimentally gathered neural spike trains, and we
               demonstrate improvements over conventional
               estimators.},
  annote    = {Spotlight Presentation}
}

@inproceedings{DavGha11,
  author    = {Davies, A. and Ghahramani, Z.},
  title     = {Language-independent {Bayesian} sentiment mining of
               {Twitter}},
  booktitle = {The Fifth Workshop on Social Network Mining
               and Analysis (SNA-KDD 2011)},
  year      = 2011,
  month     = {August},
  url       = {http://www.alexdavies.net/wordpress/wp-content/uploads/2011/09/Language-Indepedent-Bayesian-Sentiment-Mining-of-Twitter.pdf},
  abstract  = {This paper outlines a new language-independent model
               for sentiment analysis of short, social-network
               statuses. We demonstrate this on data from Twitter,
               modelling happy vs sad sentiment, and show that in
               some circumstances this outperforms similar Naive
               Bayes models by more than 10\%.  We also propose an
               extension to allow the modelling of different
               sentiment distributions in different geographic
               regions, while incorporating information from
               neighbouring regions.  We outline the considerations
               when creating a system analysing Twitter data and
               present a scalable system of data acquisition and
               prediction that can monitor the sentiment of tweets
               in real time.}
}

@inproceedings{DeiHubHan09,
  cat       = {gp time},
  author    = {Marc Peter Deisenroth and Marco F. Huber and Uwe
               D. Hanebeck},
  title     = {Analytic Moment-based {G}aussian Process Filtering},
  booktitle = icml26,
  editor    = {L{\'e}on Bottou and Michael Littman},
  pages     = {225--232},
  year      = 2009,
  month     = {June},
  address   = {Montr\'{e}al, QC, Canada},
  publisher = {Omnipress},
  url       = {.},
  abstract  = {We propose an analytic moment-based filter for
               nonlinear stochastic dynamic systems modeled by
               Gaussian processes. Exact expressions for the
               expected value and the covariance matrix are
               provided for both the prediction step and the filter
               step, where an additional Gaussian assumption is
               exploited in the latter case. Our filter does not
               require further approximations. In particular, it
               avoids finite-sample approximations. We compare the
               filter to a variety of Gaussian filters, that is,
               the EKF, the UKF, and the recent GP-UKF proposed by
               <a
               href="http://www.cs.washington.edu/homes/fox/postscripts/gp-ukf-iros-07.pdf">Ko
               et al. (2007)</a>.},
  annote    = {With corrections. <a
               href="http://mlg.eng.cam.ac.uk/marc/downloads/gpadf.zip">code</a>.}
}

@inproceedings{DeiPetRas08,
  cat       = {gp approx},
  author    = {Marc Peter Deisenroth and
               Jan Peters and
               Carl Edward Rasmussen},
  title     = {Approximate Dynamic Programming with {G}aussian
               Processes},
  booktitle = {2008 American Control Conference (ACC 2008)},
  pages     = {4480--4485},
  year      = 2008,
  month     = {June},
  address   = {Seattle, WA, USA},
  url       = {.},
  abstract  = {In general, it is difficult to determine an optimal
               closed-loop policy in nonlinear control problems
               with continuous-valued state and control
               domains. Hence, approximations are often
               inevitable. The standard method of discretizing
               states and controls suffers from the curse of
               dimensionality and strongly depends on the chosen
               temporal sampling rate. The paper introduces
               Gaussian Process Dynamic Programming (GPDP).  In
               GPDP, value functions in the Bellman recursion of
               the dynamic programming algorithm are modeled using
               Gaussian processes. GPDP returns an optimal
               state-feedback for a finite set of states. Based on
               these outcomes, we learn a possibly discontinuous
               closed-loop policy on the entire state space by
               switching between two independently trained Gaussian
               processes.},
  annote    = {<a
               href="http://mlg.eng.cam.ac.uk/marc/code/acc2008.zip">code</a>.}
}

@inproceedings{DeiRas09,
  cat       = {rl},
  author    = {Marc Peter Deisenroth and
               Carl Edward Rasmussen},
  title     = {Bayesian Inference for Efficient Learning in
               Control},
  booktitle = {Multidisciplinary Symposium on Reinforcement
               Learning},
  year      = 2009,
  month     = {June},
  address   = {Montr\'{e}al, QC, Canada},
  url       = {.},
  abstract  = {In contrast to humans or animals, artificial
               learners often require more trials when learning
               motor control tasks solely based on experience.
               Efficient autonomous learners will reduce the amount
               of engineering required to solve control
               problems. By using probabilistic forward models, we
               can employ two key ingredients of biological
               learning systems to speed up artificial learning. We
               present a consistent and coherent Bayesian framework
               that allows for efficient autonomous
               experience-based learning. We demonstrate the
               success of our learning algorithm by applying it to
               challenging nonlinear control problems in simulation
               and in hardware.}
}

@inproceedings{DeiRas09b,
  cat       = {rl},
  author    = {Marc Peter Deisenroth and
               Carl Edward Rasmussen},
  title     = {Efficient Reinforcement Learning for Motor Control},
  booktitle = {10th International PhD Workshop on Systems and
               Control},
  year      = 2009,
  month     = {September},
  address   = {Hlubok\'{a} nad Vltavou, Czech Republic},
  url       = {.},
  abstract  = {Artificial learners often require many more trials
               than humans or animals when learning motor control
               tasks in the absence of expert knowledge. We
               implement two key ingredients of biological learning
               systems, generalization and incorporation of
               uncertainty into the decision-making process, to
               speed up artificial learning. We present a coherent
               and fully Bayesian framework that allows for
               efficient artificial learning in the absence of
               expert knowledge. The success of our learning
               framework is demonstrated on challenging nonlinear
               control problems in simulation and in hardware.}
}

@inproceedings{DeiRas11,
  cat       = {gp rl},
  author    = {Marc Peter Deisenroth and
               Carl Edward Rasmussen},
  title     = {{PILCO}: {A} Model-Based and Data-Efficient Approach
               to Policy Search},
  booktitle = icml28,
  year      = 2011,
  url       = {.},
  abstract  = {In this paper, we introduce PILCO, a practical,
               data-efficient model-based policy search
               method. PILCO reduces model bias, one of the key
               problems of model-based reinforcement learning, in a
               principled way. By learning a probabilistic dynamics
               model and explicitly incorporating model uncertainty
               into long-term planning, PILCO can cope with very
               little data and facilitates learning from scratch in
               only a few trials. Policy evaluation is performed in
               closed form using state-of-the-art approximate
               inference. Furthermore, policy gradients are
               computed analytically for policy improvement. We
               report unprecedented learning efficiency on
               challenging and high-dimensional control tasks.},
  annote    = {<a href="http://mlg.eng.cam.ac.uk/carl/pilco">web
               site</a>}
}

@inproceedings{DeiRasFox11,
  cat       = {rl},
  author    = {Marc Peter Deisenroth and
               Carl Edward Rasmussen and
               Dieter Fox},
  title     = {Learning to Control a Low-Cost Manipulator using
               Data-Efficient Reinforcement Learning},
  booktitle = rss9,
  year      = 2011,
  month     = {June},
  address   = {Los Angeles, CA, USA},
  url       = {.},
  abstract  = {Over the last years, there has been substantial
               progress in robust manipulation in unstructured
               environments. The long-term goal of our work is to
               get away from precise, but very expensive robotic
               systems and to develop affordable, potentially
               imprecise, self-adaptive manipulator systems that
               can interactively perform tasks such as playing with
               children. In this paper, we demonstrate how a
               low-cost off-the-shelf robotic system can learn
               closed-loop policies for a stacking task in only a
               handful of trials - from scratch. Our manipulator is
               inaccurate and provides no pose feedback. For
               learning a controller in the work space of a
               Kinect-style depth camera, we use a model-based
               reinforcement learning technique. Our learning
               method is data efficient, reduces model bias, and
               deals with several noise sources in a principled way
               during long-term planning. We present a way of
               incorporating state-space constraints into the
               learning process and analyze the learning gain by
               exploiting the sequential structure of the stacking
               task.},
  annote    = {<a
               href="http://www.cs.washington.edu/ai/Mobile_Robotics/projects/robot-rl">project
               site</a>}
}

@inproceedings{DeiRasPet08,
  cat       = {rl gp},
  author    = {Marc Peter Deisenroth and
               Carl Edward Rasmussen and
               Jan Peters},
  title     = {Model-Based Reinforcement Learning with Continuous
               States and Actions},
  booktitle = {Proceedings of the 16th European Symposium on
               Artificial Neural Networks (ESANN 2008)},
  pages     = {19--24},
  year      = 2008,
  month     = {April},
  address   = {Bruges, Belgium},
  url       = {.},
  abstract  = {Finding an optimal policy in a reinforcement
               learning (RL) framework with continuous state and
               action spaces is challenging. Approximate solutions
               are often inevitable. GPDP is an approximate dynamic
               programming algorithm based on Gaussian process (GP)
               models for the value functions.  In this paper, we
               extend GPDP to the case of unknown transition
               dynamics.  After building a GP model for the
               transition dynamics, we apply GPDP to this model and
               determine a continuous-valued policy in the entire
               state space. We apply the resulting controller to
               the underpowered pendulum swing up. Moreover, we
               compare our results on this RL task to a nearly
               optimal discrete DP solution in a fully known
               environment.},
  annote    = {<a
               href="http://mlg.eng.cam.ac.uk/marc/code/esann2008.zip">code</a>. <a
               href="http://mlg.eng.cam.ac.uk/marc/talks/2008-04-23-ESANN-bruges.pdf">slides</a>}
}

@article{DeiRasPet09,
  author    = {Marc Peter Deisenroth and Carl Edward Rasmussen and Jan Peters},
  title     = {Gaussian process dynamic programming},
  journal   = {Neurocomputing},
  volume    = 72,
  number    = {7--9},
  pages     = {1508--1524},
  year      = 2009,
  month     = {March},
  publisher = {Elsevier B. V.},
  doi       = {10.1016/j.neucom.2008.12.019},
  url       = {.},
  cat       = {rl gp},
  annote    = {<a
    href="http://mlg.eng.cam.ac.uk/marc/code/algpdp.zip">code</a>.},
  abstract  = {Reinforcement learning (RL) and optimal control of
    systems with continuous states and actions require
    approximation techniques in most interesting
    cases. In this article, we introduce Gaussian
    process dynamic programming (GPDP), an approximate
    value function-based RL algorithm. We consider both
    a classic optimal control problem, where
    problem-specific prior knowledge is available, and a
    classic RL problem, where only very general priors
    can be used. For the classic optimal control
    problem, GPDP models the unknown value functions
    with Gaussian processes and generalizes dynamic
    programming to continuous-valued states and
    actions. For the RL problem, GPDP starts from a
    given initial state and explores the state space
    using Bayesian active learning. To design a fast
    learner, available data have to be used
    efficiently. Hence, we propose to learn
    probabilistic models of the a priori unknown
    transition dynamics and the value functions on the
    fly. In both cases, we successfully apply the
    resulting continuous-valued controllers to the
    under-actuated pendulum swing up and analyze the
    performances of the suggested algorithms. It turns
    out that GPDP uses data very efficiently and can be
    applied to problems, where classic dynamic
    programming would be cumbersome.}
}

@mastersthesis{Dos09,
  author   = {Finale Doshi-Velez},
  title    = {The {I}ndian Buffet Process: {S}calable Inference and Extensions},
  school   = {University of Cambridge},
  address  = {Cambridge, UK},
  year     = 2009,
  month    = {August},
  url      = {.},
  cat      = {np},
  abstract = {Many unsupervised learning problems seek to identify
    hidden features from observations. In many
    real-world situations, the number of hidden features
    is unknown. To avoid specifying the number of hidden
    features a priori, one can use the Indian Buffet
    Process (IBP): a nonparametric latent feature model
    that does not bound the number of active features in
    a dataset. While elegant, the lack of efficient
    inference procedures for the IBP has prevented its
    application in large-scale problems.  The core
    contribution of this thesis are three new inference
    procedures that allow inference in the IBP to be
    scaled from a few hundred to 100,000 observations.
    This thesis contains three parts: (1) An
    introduction to the IBP and a review of inference
    techniques and extensions. The first chapters
    summarise three constructions for the IBP and review
    all currently published inference techniques.
    Appendix C reviews extensions of the IBP to date.
    (2) Novel techniques for scalable Bayesian
    inference. This thesis presents three new inference
    procedures: (a) an accelerated Gibbs sampler for
    efficient Bayesian inference in a broad class of
    conjugate models, (b) a parallel, asynchronous Gibbs
    sampler that allows the accelerated Gibbs sampler to
    be distributed across multiple processors, and (c) a
    variational inference procedure for the IBP.  (3) A
    framework for structured nonparametric latent
    feature models.  We also present extensions to the
    IBP to model more sophisticated relationships
    between the co-occurring hidden features, providing
    a general framework for correlated non-parametric
    feature models.}
}

@inproceedings{Dos09b,
  author    = {Finale Doshi-Velez},
  title     = {The Infinite Partially Observable {M}arkov Decision Process},
  booktitle = nips23,
  year      = 2009,
  month     = {December},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  url       = {.},
  abstract  = {The Partially Observable Markov Decision Process
    (POMDP) framework has proven useful in planning
    domains where agents must balance actions that
    provide knowledge and actions that provide
    reward. Unfortunately, most POMDPs are complex
    structures with a large number of parameters.  In
    many real-world problems, both the structure and the
    parameters are difficult to specify from domain
    knowledge alone. Recent work in Bayesian
    reinforcement learning has made headway in learning
    POMDP models; however, this work has largely focused
    on learning the parameters of the POMDP model. We
    define an infinite POMDP (iPOMDP) model that does
    not require knowledge of the size of the state
    space; instead, it assumes that the number of
    visited states will grow as the agent explores its
    world and only models visited states explicitly. We
    demonstrate the iPOMDP on several standard
    problems.}
}

@inproceedings{DosGha09,
  author    = {Finale Doshi-Velez and Zoubin Ghahramani},
  title     = {Accelerated {Gibbs} sampling for the {Indian} buffet process},
  booktitle = icml26,
  editor    = {L{\'e}on Bottou and Michael Littman},
  pages     = {273--280},
  year      = 2009,
  month     = {June},
  publisher = {Omnipress},
  address   = {Montr{\'e}al, QC, Canada},
  url       = {.},
  cat       = {np mcmc},
  abstract  = {We often seek to identify co-occurring hidden
    features in a set of observations. The Indian Buffet
    Process (IBP) provides a non-parametric prior on the
    features present in each observation, but current
    inference techniques for the IBP often scale
    poorly. The collapsed Gibbs sampler for the IBP has
    a running time cubic in the number of observations,
    and the uncollapsed Gibbs sampler, while linear, is
    often slow to mix. We present a new linear-time
    collapsed Gibbs sampler for conjugate likelihood
    models and demonstrate its efficacy on large
    real-world datasets.}
}

@inproceedings{DosGha09b,
  author    = {Finale Doshi-Velez and Zoubin Ghahramani},
  title     = {Correlated non-parametric latent feature models},
  booktitle = {Conference on Uncertainty in Artificial Intelligence (UAI 2009)},
  pages     = {143--150},
  year      = 2009,
  month     = {June},
  publisher = {AUAI Press},
  address   = {Montr{\'e}al, QC, Canada},
  url       = {.},
  cat       = {np},
  abstract  = {We are often interested in explaining data through a
    set of hidden factors or features.  To allow for an
    unknown number of such hidden features, one can use
    the IBP: a non-parametric latent feature model that
    does not bound the number of active features in a
    dataset. However, the IBP assumes that all latent
    features are uncorrelated, making it inadequate for
    many real-world problems.  We introduce a framework
    for correlated non-parametric feature models,
    generalising the IBP. We use this framework to
    generate several specific models and demonstrate
    applications on real-world datasets.}
}

@inproceedings{DosGha11,
  author    = {Finale Doshi-Velez and Zoubin Ghahramani},
  title     = {A Comparison of Human and Agent Reinforcement Learning in Partially Observable Domains},
  booktitle = {33rd Annual Meeting of the Cognitive Science Society},
  year      = 2011,
  address   = {Boston, MA},
  url       = {.},
  cat       = {rl},
  abstract  = {It is commonly stated that reinforcement learning
    (RL) algorithms learn slower than humans. In this
    work, we investigate this claim using two standard
    problems from the RL literature. We compare the
    performance of human subjects to RL techniques. We
    find that context---the meaningfulness of the
    observations---plays a significant role in the rate
    of human RL. Moreover, without contextual
    information, humans often fare much worse than
    classic algorithms. Comparing the detailed responses
    of humans and RL algorithms, we also find that
    humans appear to employ rather different strategies
    from standard algorithms, even in cases where they
    had indistinguishable performance to them. Our
    research both sheds light on human RL and provides
    insights for improving RL algorithms.}
}

@inproceedings{DosKnoMohGha09,
  author    = {Finale Doshi-Velez and David Knowles and Shakir Mohamed and Zoubin Ghahramani},
  title     = {Large Scale Non-parametric Inference: {D}ata Parallelisation in the {I}ndian Buffet Process},
  booktitle = nips23,
  pages     = {1294--1302},
  year      = 2009,
  month     = {December},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  url       = {.},
  cat       = {np approx},
  abstract  = {Nonparametric Bayesian models provide a framework
    for flexible probabilistic modelling of complex
    datasets. Unfortunately, the high-dimensional
    averages required for Bayesian methods can be slow,
    especially with the unbounded representations used
    by nonparametric models. We address the challenge of
    scaling Bayesian inference to the increasingly large
    datasets found in real-world applications.  We focus
    on parallelisation of inference in the Indian Buffet
    Process (IBP), which allows data points to have an
    unbounded number of sparse latent features. Our
    novel MCMC sampler divides a large data set between
    multiple processors and uses message passing to
    compute the global likelihoods and posteriors.  This
    algorithm, the first parallel inference scheme for
    IBP-based models, scales to datasets orders of
    magnitude larger than have previously been
    possible.}
}

@inproceedings{DosMilVanTeh09,
  author        = {Finale Doshi-Velez and Kurt T. Miller and Jurgen {Van Gael} and Yee Whye Teh},
  title         = {Variational inference for the {I}ndian buffet process},
  booktitle     = aistats12,
  volume        = 5,
  pages         = {137--144},
  year          = 2009,
  month         = {April},
  publisher     = jmlr,
  address       = {Clearwater Beach, FL, USA},
  url           = {.},
  cat           = {np},
  keywords      = {Factor Analysis,IBP,Variational},
  mendeley-tags = {Factor Analysis,IBP,Variational},
  abstract      = {The Indian Buffet Process (IBP) is a nonparametric
    prior for latent feature models in which
    observations are influenced by a combination of
    hidden features. For example, images may be composed
    of several objects and sounds may consist of several
    notes. Latent feature models seek to infer these
    unobserved features from a set of observations; the
    IBP provides a principled prior in situations where
    the number of hidden features is unknown. Current
    inference methods for the IBP have all relied on
    sampling. While these methods are guaranteed to be
    accurate in the limit, samplers for the IBP tend to
    mix slowly in practice. We develop a deterministic
    variational method for inference in the IBP based on
    a truncated stick-breaking approximation, provide
    theoretical bounds on the truncation error, and
    evaluate our method in several data regimes.}
}

@techreport{DosMilVanTeh09b,
  author      = {Finale Doshi-Velez and Kurt T. Miller and Jurgen {Van Gael} and Yee Whye Teh},
  title       = {Variational Inference for the {I}ndian Buffet Process},
  institution = {University of Cambridge},
  number      = {CBL-2009-001},
  address     = {Computational and Biological Learning Laboratory, Department of Engineering},
  year        = 2009,
  month       = {April},
  url         = {.},
  cat         = {np},
  abstract    = {The Indian Buffet Process (IBP) is a nonparametric
    prior for latent feature models in which
    observations are influenced by a combination of
    hidden features. For example, images may be composed
    of several objects and sounds may consist of several
    notes. Latent feature models seek to infer these
    unobserved features from a set of observations; the
    IBP provides a principled prior in situations where
    the number of hidden features is unknown. Current
    inference methods for the IBP have all relied on
    sampling. While these methods are guaranteed to be
    accurate in the limit, samplers for the IBP tend to
    mix slowly in practice. We develop a deterministic
    variational method for inference in the IBP based on
    truncating to finite models, provide theoretical
    bounds on the truncation error, and evaluate our
    method in several data regimes. This technical
    report is a longer version of Doshi-Velez et
    al. (2009).}
}

@article{DosRoy08,
  author   = {Finale Doshi and Nicholas Roy},
  title    = {Spoken Language Interaction with Model Uncertainty: {A}n Adaptive Human-Robot Interaction System},
  journal  = {Connection Science},
  volume   = 20,
  number   = 4,
  pages    = {290--318},
  year     = 2008,
  month    = {December},
  url      = {.},
  abstract = {Spoken language is one of the most intuitive forms
    of interaction between humans and
    agents. Unfortunately, agents that interact with
    people using natural language often experience
    communication errors and do not correctly understand
    the user's intentions. Recent systems have
    successfully used probabilistic models of speech,
    language, and user behavior to generate robust
    dialog performance in the presence of noisy speech
    recognition and ambiguous language choices, but
    decisions made using these probabilistic models are
    still prone to errors due to the complexity of
    acquiring and maintaining a complete model of human
    language and behavior.  In this paper, we describe a
    decision-theoretic model for human-robot interaction
    using natural language. Our algorithm is based on
    the Partially Observable Markov Decision Process
    (POMDP), which allows agents to choose actions that
    are robust not only to uncertainty from noisy or
    ambiguous speech recognition but also unknown user
    models. Like most dialog systems, a POMDP is defined
    by a large number of parameters that may be
    difficult to specify a priori from domain knowledge,
    and learning these parameters from the user may
    require an unacceptably long training period. We
    describe an extension to the POMDP model that allows
    the agent to acquire a linguistic model of the user
    online, including new vocabulary and word choice
    preferences.  Our approach not only avoids a
    training period of constant questioning as the agent
    learns, but also allows the agent to actively query
    for additional information when its uncertainty
    suggests a high risk of mistakes. We demonstrate our
    approach both in simulation and on a natural
    language interaction system for a robotic wheelchair
    application.}
}

@inproceedings{DubHwaRanetal04,
  author    = {A.~Dubey and S.~Hwang and C.~Rangel and Carl Edward Rasmussen and Zoubin Ghahramani and David L.~Wild},
  title     = {Clustering Protein Sequence and Structure Space with Infinite {G}aussian Mixture Models},
  booktitle = {Pacific Symposium on Biocomputing 2004},
  journal   = {Pacific Symposium on Biocomputing 2004; Vol. 9},
  pages     = {399--410},
  year      = 2004,
  publisher = {World Scientific Publishing},
  address   = {Singapore},
  location  = {The Big Island of Hawaii},
  url       = {.},
  cat       = {np bioinf clust},
  abstract  = {We describe a novel approach to the problem of
    automatically clustering protein sequences and
    discovering protein families, subfamilies etc.,
    based on the theory of infinite Gaussian mixture
    models. This method allows the data itself to
    dictate how many mixture components are required to
    model it, and provides a measure of the probability
    that two proteins belong to the same cluster. We
    illustrate our methods with application to three
    data sets: globin sequences, globin sequences with
    known three-dimensional structures and G-protein
    coupled receptor sequences. The consistency of the
    clusters indicates that our method is producing
    biologically meaningful results, which provide a
    very good indication of the underlying families and
    subfamilies. With the inclusion of secondary
    structure and residue solvent accessibility
    information, we obtain a classification of sequences
    of known structure which reflects and extends their
    SCOP classifications.}
}

@inproceedings{DuvNicRas11,
  author    = {David Duvenaud and Hannes Nickisch and Carl Edward Rasmussen},
  title     = {Additive {G}aussian Processes},
  booktitle = nips25,
  pages     = {1--8},
  year      = 2011,
  month     = {December},
  address   = {Granada, Spain},
  url       = {http://mlg.eng.cam.ac.uk/duvenaud/papers/additive_preprint.pdf},
  cat       = {gp},
  abstract  = {We introduce a Gaussian process model of functions
    which are $\textit{additive}$.  An additive function
    is one which decomposes into a sum of
    low-dimensional functions, each depending on only a
    subset of the input variables. Additive GPs
    generalize both Generalized Additive Models, and the
    standard GP models which use squared-exponential
    kernels.  Hyperparameter learning in this model can
    be seen as Bayesian Hierarchical Kernel Learning
    (HKL).  We introduce an expressive but tractable
    parameterization of the kernel function, which
    allows efficient evaluation of all input interaction
    terms, whose number is exponential in the input
    dimension.  The additional structure discoverable by
    this model results in increased interpretability, as
    well as state-of-the-art predictive power in
    regression tasks.}
}

@inproceedings{EatGha09,
  author    = {Frederik Eaton and Zoubin Ghahramani},
  title     = {Choosing a Variable to Clamp: {A}pproximate Inference Using Conditioned Belief Propagation},
  booktitle = aistats12,
  editor    = {D. van Dyk and M. Welling},
  volume    = 5,
  pages     = {145--152},
  year      = 2009,
  month     = {April},
  publisher = jmlr,
  address   = {Clearwater Beach, FL, USA},
  url       = {.},
  cat       = {gm},
  annote    = {<a
    href="http://mlg.eng.cam.ac.uk/frederik/libdai-eaton.tar.gz">Code</a>
    (in C++ based on <a
    href="http://www.kyb.mpg.de/bs/people/jorism/libDAI">libDAI</a>).},
  abstract  = {In this paper we propose an algorithm for
    approximate inference on graphical models based on
    belief propagation (BP). Our algorithm is an
    approximate version of Cutset Conditioning, in which
    a subset of variables is instantiated to make the
    rest of the graph singly connected. We relax the
    constraint of single-connectedness, and select
    variables one at a time for conditioning, running
    belief propagation after each selection. We consider
    the problem of determining the best variable to
    clamp at each level of recursion, and propose a fast
    heuristic which applies back-propagation to the BP
    updates.  We demonstrate that the heuristic performs
    better than selecting variables at random, and give
    experimental results which show that it performs
    competitively with existing approximate inference
    algorithms.}
}

@inproceedings{EicTolZieetal04,
  author    = {Jan Eichhorn and Andreas S.~Tolias and Alexander Zien and Malte Ku{\ss} and Carl Edward Rasmussen and Jason Weston and Nikos K.~Logothetis and Bernhard Sch{\"o}lkopf},
  title     = {Prediction on Spike Data Using Kernel Algorithms},
  booktitle = nips16,
  editor    = {Sebastian Thrun and Lawrence K.~Saul and Bernhard Sch{\"o}lkopf},
  volume    = 16,
  pages     = {1367--1374},
  year      = 2004,
  publisher = mit,
  address   = {Cambridge, MA, USA},
  location  = {Vancouver, BC, Canada},
  url       = {.},
  abstract  = {We report and compare the performance of different
    learning algorithms based on data from cortical
    recordings. The task is to predict the orientation
    of visual stimuli from the activity of a population
    of simultaneously recorded neurons. We compare
    several ways of improving the coding of the input
    (i.e., the spike data) as well as of the output
    (i.e., the orientation), and report the results
    obtained using different kernel algorithms.}
}

@inproceedings{FraKwoRasSch04,
  author    = {Matthias O.~Franz and Younghee Kwon and Carl Edward Rasmussen and Bernhard Sch{\"o}lkopf},
  title     = {Semi-supervised kernel regression using whitened function classes},
  booktitle = lncs,
  journal   = {Pattern Recognition, Proceedings of the 26th {DAGM} Symposium},
  editor    = {C.~E.~Rasmussen and H.~H.~B{\"u}lthoff and M.~A.~Giese and B.~Sch{\"o}lkopf},
  volume    = 3175,
  pages     = {18--26},
  year      = 2004,
  publisher = {Springer},
  address   = {Berlin, Germany},
  url       = {.},
  cat       = {ssl},
  abstract  = {The use of non-orthonormal basis functions in ridge
    regression leads to an often undesired non-isotropic
    prior in function space. In this study, we
    investigate an alternative regularization technique
    that results in an implicit whitening of the basis
    functions by penalizing directions in function space
    with a large prior variance. The regularization term
    is computed from unlabelled input data that
    characterizes the input distribution. Tests on two
    datasets using polynomial basis functions showed an
    improved average performance compared to standard
    ridge regression.}
}

@inproceedings{GhaGriSol07,
  author    = {Zoubin Ghahramani and Thomas L. Griffiths and Peter Sollich},
  title     = {Bayesian nonparametric latent feature models (with discussion)},
  booktitle = {Bayesian Statistics 8},
  editor    = {J. M. Bernardo and M. J. Bayarri and J. O. Berger and A. P. Dawid and D. Heckerman and A. F. M. Smith and M. West},
  pages     = {201--226},
  year      = 2007,
  month     = {July},
  publisher = oup,
  address   = {Oxford, UK},
  url       = {.},
  cat       = {np},
  annote    = {Includes discussion by David Dunson, and rejoinder.},
  abstract  = {We describe a flexible nonparametric approach to
    latent variable modelling in which the number of
    latent variables is unbounded. This approach is
    based on a probability distribution over equivalence
    classes of binary matrices with a finite number of
    rows, corresponding to the data points, and an
    unbounded number of columns, corresponding to the
    latent variables. Each data point can be associated
    with a subset of the possible latent variables,
    which we refer to as the latent features of that
    data point. The binary variables in the matrix
    indicate which latent feature is possessed by which
    data point, and there is a potentially infinite
    array of features. We derive the distribution over
    unbounded binary matrices by taking the limit of a
    distribution over N&times;K binary matrices as
    K&rarr;&infin;. We define a simple generative
    process for this distribution which we call the
    Indian buffet process (IBP; Griffiths and
    Ghahramani, 2005, <a href="#GriGha06">2006</a>) by
    analogy to the Chinese restaurant process (Aldous,
    1985; Pitman, 2002). The IBP has a single
    hyperparameter which controls both the number of
    features per object and the total number of
    features. We describe a two-parameter generalization
    of the IBP which has additional flexibility,
    independently controlling the number of features per
    object and the total number of features in the
    matrix. The use of this distribution as a prior in
    an infinite latent feature model is illustrated, and
    Markov chain Monte Carlo algorithms for inference
    are described.}
}

@inproceedings{GhaHel06,
  author    = {Zoubin Ghahramani and Katherine A. Heller},
  title     = {{B}ayesian Sets},
  booktitle = nips18,
  editor    = {Y. Weiss and B. Sch{\"o}lkopf and J. Platt},
  pages     = {435--442},
  year      = 2006,
  month     = {December},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  url       = {.},
  cat       = {ir clust},
  abstract  = {Inspired by "Google&trade; Sets", we consider the
    problem of retrieving items from a concept or
    cluster, given a query consisting of a few items
    from that cluster. We formulate this as a Bayesian
    inference problem and describe a very simple
    algorithm for solving it. Our algorithm uses a
    model-based concept of a cluster and ranks items
    using a score which evaluates the marginal
    probability that each item belongs to a cluster
    containing the query items. For exponential family
    models with conjugate priors this marginal
    probability is a simple function of sufficient
    statistics. We focus on sparse binary data and show
    that our score can be evaluated exactly using a
    single sparse matrix multiplication, making it
    possible to apply our algorithm to very large
    datasets. We evaluate our algorithm on three
    datasets: retrieving movies from EachMovie, finding
    completions of author sets from the NIPS dataset,
    and finding completions of sets of words appearing
    in the Grolier encyclopedia. We compare to
    Google&trade; Sets and show that Bayesian Sets gives
    very reasonable set completions.}
}

@article{GibSaa08,
  author   = {Richard J. Gibbens and Yunus Saat{\c{c}}i},
  title    = {Data, modelling and inference in road traffic networks},
  journal  = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences},
  volume   = 366,
  number   = 1872,
  pages    = {1907--1919},
  year     = 2008,
  month    = {June},
  doi      = {10.1098/rsta.2008.0020},
  eprint   = {http://rsta.royalsocietypublishing.org/content/366/1872/1907.full.pdf+html},
  url      = {http://rsta.royalsocietypublishing.org/content/366/1872/1907.abstract},
  abstract = {In this paper, we study UK road traffic data and
    explore a range of modelling and inference questions
    that arise from them. For example, loop detectors on
    the M25 motorway record speed and flow measurements
    at regularly spaced locations as well as the entry
    and exit lanes of junctions. An exploratory study of
    these data helps us to better understand and
    quantify the nature of congestion on the road
    network.  From a traveller's perspective it is
    crucially important to understand the overall
    journey times and we look at methods to improve our
    ability to predict journey times given access
    jointly to both real-time and historical loop
    detector data. Throughout this paper we will comment
    on related work derived from US freeway data.}
}

@inproceedings{GirRasQuiMur03,
  author    = {Agathe Girard and Carl Edward Rasmussen and Joaquin
    Qui{\~n}onero-Candela and Roderick Murray-Smith},
  title     = {Gaussian Process priors with uncertain inputs ---
    application to multiple-step ahead time series
    forecasting},
  booktitle = nips15,
  editor    = {S.~Becker and S.~Thrun and K.~Obermayer},
  pages     = {529--536},
  year      = 2003,
  month     = {December},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  url       = {.},
  cat       = {gp time},
  abstract  = {We consider the problem of multi-step ahead
    prediction in time series analysis using the
    non-parametric Gaussian process model. k-step ahead
    forecasting of a discrete-time non-linear dynamic
    system can be performed by doing repeated one-step
    ahead predictions. For a state-space model of the
    form y<sub>t</sub> =
    f(y<sub>t-1</sub>,...,y<sub>t-L</sub>), the
    prediction of y at time t + k is based on the point
    estimates of the previous outputs. In this paper, we
    show how, using an analytical Gaussian
    approximation, we can formally incorporate the
    uncertainty about intermediate regressor values,
    thus updating the uncertainty on the current
    prediction.}
}

@article{GlaParKnoetal10,
  author   = {Daniel Glass and Leopold Parts and David A. Knowles
    and Abraham Aviv and Tim D. Spector},
  title    = {No Correlation Between Childhood Maltreatment and
    Telomere Length.},
  journal  = {Biological Psychiatry},
  volume   = 68,
  number   = 6,
  pages    = {21--22},
  year     = 2010,
  cat      = {bioinf},
  abstract = {Telomeres are lengths of repetitive DNA that cap the
    ends of chromosomes. They protect the ends of the
    chromosome and shorten with each cell
    division. Short leukocyte telomere length has been
    related to a number of age-related diseases. In
    addition, shorter telomere length has been
    associated with environmental factors such as
    smoking and lack of exercise. In a recent issue of
    Biological Psychiatry, Tyrka et al. (4) published a
    report suggesting a link between maltreatment in
    childhood and telomere shortening in 31
    subjects. Individuals who had suffered maltreatment
    had telomere length .70 +/- .24 compared with 1.02
    +/- .52 in individuals who had not been abused.},
}

@inproceedings{GoeJaeRas06,
  author    = {Dilan G{\"o}r{\"u}r and Frank J{\"a}kel and Carl
    Edward Rasmussen},
  title     = {A Choice Model with Infinitely Many Latent Features},
  booktitle = icml23,
  editor    = {W.~W.~Cohen and Andrew Moore},
  pages     = {361--368},
  year      = 2006,
  month     = {June},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  location  = {Pittsburgh, PA, USA},
  doi       = {10.1145/1143844.1143890},
  url       = {.},
  cat       = {np},
  abstract  = {Elimination by aspects (EBA) is a probabilistic
    choice model describing how humans decide between
    several options.  The options from which the choice
    is made are characterized by binary features and
    associated weights. For instance, when choosing
    which mobile phone to buy the features to consider
    may be: long lasting battery, color screen,
    etc. Existing methods for inferring the parameters
    of the model assume pre-specified features. However,
    the features that lead to the observed choices are
    not always known.  Here, we present a non-parametric
    Bayesian model to infer the features of the options
    and the corresponding weights from choice data. We
    use the Indian buffet process (IBP) as a prior over
    the features. Inference using Markov chain Monte
    Carlo (MCMC) in conjugate IBP models has been
    previously described. The main contribution of this
    paper is an MCMC algorithm for the EBA model that
    can also be used in inference for other
    non-conjugate IBP models---this may broaden the use
    of IBP priors considerably.}
}

@article{GoeRas10,
  author    = {Dilan G{\"o}r{\"u}r and Carl Edward Rasmussen},
  title     = {{D}irichlet Process {G}aussian Mixture Models:
    Choice of the base distribution},
  journal   = jcst,
  volume    = 25,
  number    = 4,
  pages     = {615--625},
  year      = 2010,
  month     = {July},
  publisher = {Science Press},
  address   = {Beijing, China},
  doi       = {10.1007/s11390-010-9355-8},
  url       = {.},
  cat       = {np},
  abstract  = {In the Bayesian mixture modeling framework it is
    possible to infer the necessary number of components
    to model the data and therefore it is unnecessary to
    explicitly restrict the number of
    components. Nonparametric mixture models sidestep
    the problem of finding the "correct" number of
    mixture components by assuming infinitely many
    components. In this paper Dirichlet process mixture
    (DPM) models are cast as infinite mixture models and
    inference using Markov chain Monte Carlo is
    described. The specification of the priors on the
    model parameters is often guided by mathematical and
    practical convenience. The primary goal of this
    paper is to compare the choice of conjugate and
    non-conjugate base distributions on a particular
    class of DPM models which is widely used in
    applications, the Dirichlet process Gaussian mixture
    model (DPGMM). We compare computational efficiency
    and modeling performance of DPGMM defined using a
    conjugate and a conditionally conjugate base
    distribution. We show that better density models can
    result from using a wider class of priors with no or
    only a modest increase in computational effort.}
}

@inproceedings{GoeRasToletal04,
  author    = {Dilan G{\"o}r{\"u}r and Carl Edward Rasmussen and
    Andreas S.~Tolias and Fabian Sinz and Nikos
    K.~Logothetis},
  title     = {Modelling Spikes with Mixtures of Factor Analysers},
  booktitle = {DAGM 2004},
  journal   = {Pattern Recognition: Proceedings of the 26th DAGM
    Symposium},
  editor    = {C.~E.~Rasmussen and H.~H.~B{\"u}lthoff and
    B.~Sch{\"o}lkopf and M.~A.~Giese},
  series    = lncs,
  volume    = 3175,
  pages     = {391--398},
  year      = 2004,
  month     = 09,
  publisher = {Springer},
  address   = {Berlin, Germany},
  location  = {T{\"u}bingen, Germany},
  url       = {.},
  abstract  = {Identifying the action potentials of individual
    neurons from extracellular recordings, known as
    spike sorting, is a challenging problem. We consider
    the spike sorting problem using a generative
    model, mixtures of factor analysers, which
    concurrently performs clustering and feature
    extraction. The most important advantage of this
    method is that it quantifies the certainty with
    which the spikes are classified. This can be used as
    a means for evaluating the quality of clustering and
    therefore spike isolation. Using this method, nearly
    simultaneously occurring spikes can also be modelled
    which is a hard task for many of the spike sorting
    methods. Furthermore, modelling the data with a
    generative model allows us to generate simulated
    data.}
}

@inproceedings{GolAndVanSetetal06,
  author     = {Goldberg, A. B. and Andrzejewski, D. and {Van Gael},
    J. and Settles, B. and Zhu, X. and Craven, M.},
  title      = {Ranking Biomedical Passages for Relevance and
    Diversity: {U}niversity of {W}isconsin, {M}adison at
    {TREC} {G}enomics 2006},
  shorttitle = {Ranking Biomedical Passages for Relevance and Dive},
  booktitle  = {Proceedings of the Fifteenth Text REtrieval
    Conference (TREC 2006)},
  year       = 2006,
  month      = {November},
  address    = {Gaithersburg, MD, USA},
  url        = {.},
  abstract   = {We report on the University of Wisconsin, Madison's
    experience in the TREC Genomics 2006 track, which
    asks participants to retrieve passages from
    scientific articles that satisfy biologists'
    information needs. An emphasis is placed on
    returning relevant passages that discuss different
    aspects of the topic. Using an off-the-shelf
    information retrieval (IR) engine, we focused on
    query generation and reranking query results to
    encourage relevance and diversity. For query
    generation, we automatically identify noun phrases
    from the topic descriptions, and use online
    resources to gather synonyms as expansion terms. Our
    first submission uses the baseline IR engine
    results. We rerank the passages using a naive
    clustering-based approach in our second run, and we
    test GRASSHOPPER, a novel graph-theoretic algorithm
    based on absorbing random walks, in our third
    run. While our aspect-level results appear to
    compare favorably with other participants on
    average, our query generation techniques failed to
    produce adequate query results for several topics,
    causing our passage and document-level evaluation
    scores to suffer.  Furthermore, we surprisingly
    achieved higher aspect-level scores using the
    initial ranking than our methods aimed specifically
    at promoting diversity. While this sounds
    discouraging, we have several ideas as to why this
    happened and hope to produce new methods that
    correct these shortcomings.}
}

@inproceedings{GolZhuVanAnd07,
  author    = {Goldberg, A. B. and Zhu, X. and {Van Gael}, J. and
    Andrzejewski, D.},
  title     = {Improving diversity in ranking using absorbing
    random walks},
  booktitle = {Proceedings of NAACL HLT},
  pages     = {97--104},
  year      = 2007,
  month     = {April},
  address   = {Rochester, NY, USA},
  url       = {.}
}

@inproceedings{GriGha06,
  author    = {T. L. Griffiths and Z. Ghahramani},
  title     = {Infinite Latent Feature Models and the {I}ndian
    {B}uffet {P}rocess},
  booktitle = nips18,
  editor    = {Y. Weiss and B. Sch{\"o}lkopf and J. Platt},
  pages     = {475--482},
  year      = 2006,
  month     = {December},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  url       = {.},
  cat       = {np},
  abstract  = {We define a probability distribution over
    equivalence classes of binary matrices with a finite
    number of rows and an unbounded number of
    columns. This distribution is suitable for use as a
    prior in probabilistic models that represent objects
    using a potentially infinite array of features. We
    identify a simple generative process that results in
    the same distribution over equivalence classes,
    which we call the Indian buffet process. We
    illustrate the use of this distribution as a prior
    in an infinite latent feature model, deriving a
    Markov chain Monte Carlo algorithm for inference in
    this model and applying the algorithm to an image
    dataset.}
}

@article{GriGha11,
  author   = {Thomas L. Griffiths and Zoubin Ghahramani},
  title    = {The {Indian} buffet process: {An} introduction and
    review},
  journal  = jmlr,
  volume   = 12,
  pages    = {1185--1224},
  year     = 2011,
  month    = {April},
  url      = {.},
  cat      = {np review},
  abstract = {The Indian buffet process is a stochastic process
    defining a probability distribution over equivalence
    classes of sparse binary matrices with a finite
    number of rows and an unbounded number of
    columns. This distribution is suitable for use as a
    prior in probabilistic models that represent objects
    using a potentially infinite array of features, or
    that involve bipartite graphs in which the size of
    at least one class of nodes is unknown. We give a
    detailed derivation of this distribution, and
    illustrate its use as a prior in an infinite latent
    feature model. We then review recent applications of
    the Indian buffet process in machine learning,
    discuss its extensions, and summarize its
    connections to other stochastic processes.}
}

@inproceedings{GuaDyNiuetal10,
  author    = {Y. Guan and J. G. Dy and D. Niu and Z. Ghahramani},
  title     = {Variational inference for nonparametric multiple
    clustering},
  booktitle = {KDD10 Workshop on Discovering, Summarizing, and
    Using Multiple Clusterings},
  year      = 2010,
  month     = {July},
  address   = {Washington, DC, USA},
  url       = {.},
  cat       = {np clust},
  abstract  = {Most clustering algorithms produce a single
    clustering solution.  Similarly, feature selection
    for clustering tries to find one feature subset
    where one interesting clustering solution
    resides. However, a single data set may be
    multi-faceted and can be grouped and interpreted in
    many different ways, especially for high dimensional
    data, where feature selection is typically
    needed. Moreover, different clustering solutions are
    interesting for different purposes. Instead of
    committing to one clustering solution, in this paper
    we introduce a probabilistic nonparametric Bayesian
    model that can discover several possible clustering
    solutions and the feature subset views that
    generated each cluster partitioning
    simultaneously. We provide a variational inference
    approach to learn the features and clustering
    partitions in each view. Our model allows us not
    only to learn the multiple clusterings and views but
    also allows us to automatically learn the number of
    views and the number of clusters in each view.}
}

@inproceedings{HalRasMac11,
  author    = {Joseph Hall and Carl Edward Rasmussen and Jan
    Maciejowski},
  title     = {Reinforcement Learning with Reference Tracking
    Control in Continuous State Spaces},
  booktitle = {Proceedings of 50th IEEE Conference on Decision and
    Control and European Control Conference},
  year      = 2011,
  url       = {.},
  cat       = {rl},
  abstract  = {The contribution described in this paper is an
    algorithm for learning nonlinear, reference
    tracking, control policies given no prior knowledge
    of the dynamical system and limited interaction with
    the system through the learning process. Concepts
    from the field of reinforcement learning, Bayesian
    statistics and classical control have been brought
    together in the formulation of this algorithm which
    can be viewed as a form indirect self tuning
    regulator. On the task of reference tracking using
    the inverted pendulum it was shown to yield
    generally improved performance on the best
    controller derived from the standard linear
    quadratic method using only 30 s of total
    interaction with the system. Finally, the algorithm
    was shown to work on the double pendulum proving its
    ability to solve nontrivial control tasks.}
}

@article{HanRas94,
  author    = {Lars Kai Hansen and Carl Edward Rasmussen},
  title     = {Pruning from adaptive regularization},
  journal   = nc,
  volume    = 6,
  number    = 6,
  pages     = {1222--1231},
  year      = 1994,
  publisher = mit,
  url       = {.},
  cat       = {approx},
  abstract  = {Inspired by the recent upsurge of interest in
    Bayesian methods we consider adaptive
    regularization. A generalization based scheme for
    adaptation of regularization parameters is
    introduced and compared to Bayesian
    regularization. We show that pruning arises
    naturally within both adaptive regularization
    schemes. As model example we have chosen the
    simplest possible: estimating the mean of a random
    variable with known variance. Marked similarities
    are found between the two methods in that they both
    involve a "noise limit", below which they regularize
    with infinite weight decay, i.e., they
    prune. However, pruning is not always beneficial. We
    show explicitly that both methods in some cases may
    increase the generalization error. This corresponds
    to situations where the underlying assumptions of
    the regularizer are poorly matched to the
    environment.}
}

@inproceedings{HelWilGha08,
  author    = {Katherine A. Heller and Sinead Williamson and Zoubin
    Ghahramani},
  title     = {Statistical models for partial membership},
  booktitle = icml25,
  editor    = {Andrew McCallum and Sam Roweis},
  pages     = {392--399},
  year      = 2008,
  month     = {July},
  publisher = {Omnipress},
  address   = {Helsinki, Finland},
  url       = {.},
  cat       = {clust},
  abstract  = {We present a principled Bayesian framework for
    modeling partial memberships of data points to
    clusters. Unlike a standard mixture model which
    assumes that each data point belongs to one and only
    one mixture component, or cluster, a partial
    membership model allows data points to have
    fractional membership in multiple
    clusters. Algorithms which assign data points
    partial memberships to clusters can be useful for
    tasks such as clustering genes based on microarray
    data (Gasch & Eisen, 2002). Our Bayesian Partial
    Membership Model (BPM) uses exponential family
    distributions to model each cluster, and a product
    of these distributions, with weighted parameters, to
    model each datapoint. Here the weights correspond to
    the degree to which the datapoint belongs to each
    cluster. All parameters in the BPM are continuous,
    so we can use Hybrid Monte Carlo to perform
    inference and learning. We discuss relationships
    between the BPM and Latent Dirichlet Allocation,
    Mixed Membership models, Exponential Family PCA, and
    fuzzy clustering. Lastly, we show some experimental
    results and discuss nonparametric extensions to our
    model.}
}

% NOTE(review): NIPS 2011 proceedings are usually cited as volume 24; confirm that the nips25 macro below is the intended venue.
@inproceedings{HerHerDup11,
  author    = {Daniel Hern{\'a}ndez-Lobato and Jos{\'e} Miguel
    Hern{\'a}ndez-Lobato and Pierre Dupont},
  title     = {Robust Multi-Class {G}aussian Process
    Classification},
  booktitle = nips25,
  year      = 2011,
  url       = {.},
  cat       = {np clust},
  abstract  = {Multi-class Gaussian Process Classifiers (MGPCs)
    are often affected by overfitting problems when
    labeling errors occur far from the decision
    boundaries.  To prevent this, we investigate a
    robust MGPC (RMGPC) which considers labeling errors
    independently of their distance to the decision
    boundaries. Expectation propagation is used for
    approximate inference. Experiments with several
    datasets in which noise is injected in the labels
    illustrate the benefits of RMGPC. This method
    performs better than other Gaussian process
    alternatives based on considering latent Gaussian
    noise or heavy-tailed processes. When no noise is
    injected in the labels, RMGPC still performs equal
    or better than the other methods. Finally, we show
    how RMGPC can be used for successfully identifying
    data instances which are difficult to classify
    correctly in practice.}
}

@inproceedings{HoeRasHan00,
  author    = {Pedro A.~d.~F.~R.~H{\o}jen-S{\o}rensen and Carl
    Edward Rasmussen and Lars Kai Hansen},
  title     = {{B}ayesian modelling of {fMRI} time series},
  booktitle = nips12,
  editor    = {Sara A. Solla and Todd K. Leen and Klaus-Robert
    M{\"u}ller},
  pages     = {754--760},
  year      = 2000,
  publisher = mit,
  url       = {.},
  cat       = {time},
  abstract  = {We present a Hidden Markov Model (HMM) for inferring
    the hidden psychological state (or neural activity)
    during single trial fMRI activation experiments with
    blocked task paradigms. Inference is based on
    Bayesian methodology, using a combination of
    analytical and a variety of Markov Chain Monte Carlo
    (MCMC) sampling techniques. The advantage of this
    method is that detection of short time learning
    effects between repeated trials is possible since
    inference is based only on single trial
    experiments.}
}

@inproceedings{HueBorKriGha08,
  author    = {C. H{\"u}bler and K. Borgwardt and H.-P. Kriegel and
    Z. Ghahramani},
  title     = {{M}etropolis algorithms for representative subgraph
    sampling},
  booktitle = {Proceedings of 8th IEEE International Conference on
    Data Mining (ICDM 2008)},
  pages     = {283--292},
  year      = 2008,
  month     = {December},
  publisher = {IEEE},
  address   = {Pisa, Italy},
  note      = {ISSN: 1550-4786},
  url       = {.},
  cat       = {mcmc},
  abstract  = {While data mining in chemoinformatics studied graph
    data with dozens of nodes, systems biology and the
    Internet are now generating graph data with
    thousands and millions of nodes. Hence data mining
    faces the algorithmic challenge of coping with this
    significant increase in graph size: Classic
    algorithms for data analysis are often too expensive
    and too slow on large graphs.<br/> While one
    strategy to overcome this problem is to design novel
    efficient algorithms, the other is to 'reduce' the
    size of the large graph by sampling. This is the
    scope of this paper: We will present novel
    Metropolis algorithms for sampling a
    'representative' small subgraph from the original
    large graph, with 'representative' describing the
    requirement that the sample shall preserve crucial
    graph properties of the original graph. In our
    experiments, we improve over the pioneering work of
    Leskovec and Faloutsos (KDD 2006), by producing
    representative subgraph samples that are both
    smaller and of higher quality than those produced by
    other methods from the literature.}
}

@techreport{HusHou11,
  author      = {Ferenc Husz{\'a}r and Neil Houlsby},
  title       = {Adaptive {Bayesian} Quantum Tomography},
  institution = {University of Cambridge},
  year        = 2011,
  annote      = {arXiv:<a
    href="http://arxiv.org/abs/1107.0895">1107.0895</a>},
  url         = {.},
  abstract    = {In this letter we revisit the problem of optimal
    design of quantum tomographic experiments. In
    contrast to previous approaches where an optimal set
    of measurements is decided in advance of the
    experiment, we allow for measurements to be
    adaptively and efficiently re-optimised depending on
    data collected so far. We develop an adaptive
    statistical framework based on Bayesian inference
    and Shannon's information, and demonstrate a
    ten-fold reduction in the total number of
    measurements required as compared to non-adaptive
    methods, including mutually unbiased bases.}
}

@techreport{HusLac11,
  author      = {Ferenc Husz{\'a}r and Simon Lacoste-Julien},
  title       = {A Kernel Approach to Tractable {Bayesian}
    Nonparametrics},
  institution = {University of Cambridge},
  year        = 2011,
  annote      = {arXiv:<a
    href="http://arxiv.org/abs/1103.1761">1103.1761</a>},
  url         = {.},
  abstract    = {Inference in popular nonparametric Bayesian models
    typically relies on sampling or other
    approximations. This paper presents a general
    methodology for constructing novel tractable
    nonparametric Bayesian methods by applying the
    kernel trick to inference in a parametric Bayesian
    model. For example, Gaussian process regression can
    be derived this way from Bayesian linear
    regression. Despite the success of the Gaussian
    process framework, the kernel trick is rarely
    explicitly considered in the Bayesian literature. In
    this paper, we aim to fill this gap and demonstrate
    the potential of applying the kernel trick to
    tractable Bayesian parametric models in a wider
    context than just regression. As an example, we
    present an intuitive Bayesian kernel machine for
    density estimation that is obtained by applying the
    kernel trick to a Gaussian generative model in
    feature space.}
}

@inproceedings{HusNopLen10,
  author    = {Ferenc Husz{\'a}r and Uta Noppeney and M{\'a}t{\'e}
    Lengyel},
  title     = {Mind reading by machine learning: {A} doubly
    {Bayesian} method for inferring mental
    representations},
  booktitle = cogsci32,
  editor    = {S. Ohlsson and R. Catrambone},
  year      = 2010,
  month     = {August},
  publisher = {The Cognitive Science Society},
  address   = {Austin, TX, USA},
  url       = {.},
  annote    = {Supplementary material available <a href =
    "http://mlg.eng.cam.ac.uk/ferenc/mindreading">here</a>.},
  abstract  = {A central challenge in cognitive science is to
    measure and quantify the mental representations
    humans develop --- in other words, to 'read'
    subjects' minds. In order to eliminate potential
    biases in reporting mental contents due to verbal
    elaboration, subjects' responses in experiments are
    often limited to binary decisions or discrete
    choices that do not require conscious reflection
    upon their mental contents. However, it is unclear
    what such impoverished data can tell us about the
    potential richness and dynamics of subjects' mental
    representations. To address this problem, we used
    ideal observer models that formalise choice
    behaviour as (quasi-)Bayes-optimal, given subjects'
    representations in long-term memory, acquired
    through prior learning, and the stimuli currently
    available to them. Bayesian inversion of such ideal
    observer models allowed us to infer subjects' mental
    representation from their choice behaviour in a
    variety of psychophysical tasks. The inferred mental
    representations also allowed us to predict future
    choices of subjects with reasonable accuracy, even
    in tasks that were different from those in which the
    representations were estimated. These results
    demonstrate a significant potential in standard
    binary decision tasks to recover detailed
    information about subjects' mental representations.}
}

@inproceedings{KasVanGraHer10,
  author    = {Kasneci, G. and {Van Gael}, J. and Graepel, T. and
    Herbrich, R.},
  title     = {Bayesian Knowledge Corroboration with Logical Rules
    and User Feedback},
  booktitle = {European Conference on Machine Learning (ECML)},
  year      = 2010,
  month     = {September},
  address   = {Barcelona, Spain},
  url       = {.},
  abstract  = {Current knowledge bases suffer from either low
    coverage or low accuracy. The underlying hypothesis
    of this work is that user feedback can greatly
    improve the quality of automatically extracted
    knowledge bases. The feedback could help quantify
    the uncertainty associated with the stored
    statements and would enable mechanisms for
    searching, ranking and reasoning at
    entity-relationship level. Most importantly, a
    principled model for exploiting user feedback to
    learn the truth values of statements in the
    knowledge base would be a major step forward in
    addressing the issue of knowledge base curation.  We
    present a family of probabilistic graphical models
    that builds on user feedback and logical inference
    rules derived from the popular Semantic-Web
    formalism of RDFS <a
    href="http://www.w3.org/TR/rdf-schema/">[1]</a>. Through
    internal inference and belief propagation, these
    models can learn both, the truth values of the
    statements in the knowledge base and the
    reliabilities of the users who give feedback. We
    demonstrate the viability of our approach in
    extensive experiments on real-world datasets, with
    feedback collected from Amazon Mechanical Turk.}
}

@inproceedings{KimGha08,
  cat          = {gp},
  volume       = 5342,
  month        = {December},
  author       = {H.~Kim and Zoubin Ghahramani},
  series       = lncs,
  booktitle    = {Structural, Syntactic and Statistical Pattern
                  Recognition},
  editor       = {da Vitoria Lobo, Niels and others},
  title        = {Outlier robust {Gaussian} process classification},
  address      = {Berlin, Germany},
  publisher    = {Springer Berlin / Heidelberg},
  year         = 2008,
  pages        = {896--905},
  url          = {.},
  abstract     = {Gaussian process classifiers (GPCs) are a fully
                  statistical model for kernel classification. We
                  present a form of GPC which is robust to labeling
                  errors in the data set. This model allows label
                  noise not only near the class boundaries, but also
                  far from the class boundaries which can result from
                  mistakes in labelling or gross errors in measuring
                  the input features. We derive an outlier robust
                  algorithm for training this model which alternates
                  iterations based on the EP approximation and
                  hyperparameter updates until convergence. We show
                  the usefulness of the proposed algorithm with model
                  selection method through simulation results.}
}

@inproceedings{KnoGaeGha11,
  author       = {David A.~Knowles and Jurgen Van Gael and Zoubin
                  Ghahramani},
  title        = {Message Passing Algorithms for the {D}irichlet
                  Diffusion Tree},
  booktitle    = icml28,
  year         = 2011,
  url          = {.},
  cat          = {np approx},
  abstract     = {We demonstrate efficient approximate inference for
                  the Dirichlet Diffusion Tree (Neal, 2003), a
                  Bayesian nonparametric prior over tree
                  structures. Although DDTs provide a powerful and
                  elegant approach for modeling hierarchies they
                  haven't seen much use to date. One problem is the
                  computational cost of MCMC inference. We provide the
                  first deterministic approximate inference methods
                  for DDT models and show excellent performance
                  compared to the MCMC alternative.  We present
                  message passing algorithms to approximate the
                  Bayesian model evidence for a specific tree. This is
                  used to drive sequential tree building and greedy
                  search to find optimal tree structures,
                  corresponding to hierarchical clusterings of the
                  data. We demonstrate appropriate observation models
                  for continuous and binary data. The empirical
                  performance of our method is very close to the
                  computationally expensive MCMC alternative on a
                  density estimation problem, and significantly
                  outperforms kernel density estimators.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/dave/knowles2011-icml.pdf">web
                  site</a>}
}

@inproceedings{KnoGha07,
  author       = {David Knowles and Zoubin Ghahramani},
  title        = {Infinite Sparse Factor Analysis and Infinite
                  Independent Components Analysis},
  booktitle    = {7th International Conference on Independent
                  Component Analysis and Signal Separation},
  pages        = {381--388},
  publisher    = {Springer},
  address      = {London, UK},
  month        = {September},
  year         = 2007,
  doi          = {10.1007/978-3-540-74494-8_48},
  url          = {.},
  cat          = {np bioinf},
  abstract     = {A nonparametric Bayesian extension of Independent
                  Components Analysis (ICA) is proposed where observed
                  data Y is modelled as a linear superposition, G, of
                  a potentially infinite number of hidden sources,
                  X. Whether a given source is active for a specific
                  data point is specified by an infinite binary
                  matrix, Z. The resulting sparse representation
                  allows increased data reduction compared to standard
                  ICA.  We define a prior on Z using the Indian Buffet
                  Process (IBP). We describe four variants of the
                  model, with Gaussian or Laplacian priors on X and
                  the one or two-parameter IBPs. We demonstrate
                  Bayesian inference under these models using a Markov
                  chain Monte Carlo (MCMC) algorithm on synthetic and
                  gene expression data and compare to standard ICA
                  algorithms.}
}

@inproceedings{KnoGha11a,
  author       = {David A.~Knowles and Zoubin Ghahramani},
  title        = {Pitman-{Y}or Diffusion Trees},
  booktitle    = uai27,
  year         = 2011,
  url          = {.},
  cat          = {np clust},
  abstract     = {We introduce the Pitman Yor Diffusion Tree (PYDT)
                  for hierarchical clustering, a generalization of the
                  Dirichlet Diffusion Tree (Neal, 2001) which removes
                  the restriction to binary branching structure. The
                  generative process is described and shown to result
                  in an exchangeable distribution over data points. We
                  prove some theoretical properties of the model and
                  then present two inference methods: a collapsed MCMC
                  sampler which allows us to model uncertainty over
                  tree structures, and a computationally efficient
                  greedy Bayesian EM search algorithm. Both algorithms
                  use message passing on the tree structure. The
                  utility of the model and algorithms is demonstrated
                  on synthetic and real world data, both continuous
                  and binary.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/dave/knowles2011uai.pdf">web
                  site</a>}
}

@article{KnoGha11b,
  cat          = {np bioinf},
  author       = {David A. Knowles and Zoubin Ghahramani},
  year         = 2011,
  title        = {Nonparametric {B}ayesian Sparse Factor Models with
                  application to Gene Expression modelling},
  journal      = {Annals of Applied Statistics},
  volume       = 5,
  number       = {2B},
  pages        = {1534--1552},
  url          = {.},
  abstract     = {A nonparametric Bayesian extension of Factor
                  Analysis (FA) is proposed where observed data Y is
                  modeled as a linear superposition, G, of a
                  potentially infinite number of hidden factors,
                  X. The Indian Buffet Process (IBP) is used as a
                  prior on G to incorporate sparsity and to allow the
                  number of latent features to be inferred. The
                  model's utility for modeling gene expression data is
                  investigated using randomly generated data sets
                  based on a known sparse connectivity matrix for
                  E. Coli, and on three biological data sets of
                  increasing complexity.}
}

@inproceedings{KnoHol09,
  author       = {David A. Knowles and Susan Holmes},
  title        = {Statistical tools for ultra-deep pyrosequencing of
                  fast evolving viruses},
  booktitle    = {NIPS Workshop: Computational Biology},
  year         = 2009,
  url          = {.},
  cat          = {bioinf},
  abstract     = {We aim to detect minor variant Hepatitis B viruses
                  (HBV) in 38 pyrosequencing samples from infected
                  individuals. Errors involved in the amplification
                  and ultra deep pyrosequencing (UDPS) of these
                  samples are characterised using HBV plasmid
                  controls. Homopolymeric regions and quality scores
                  are found to be significant covariates in
                  determining insertion and deletion (indel) error
                  rates, but not mismatch rates which depend on the
                  nucleotide transition matrix. This knowledge is used
                  to derive two methods for classifying genuine
                  mutations: a hypothesis testing framework and a
                  mixture model. Using an approximate "ground truth"
                  from a limiting dilution Sanger sequencing run,
                  these methods are shown to outperform the naive
                  percentage threshold approach. The possibility of
                  early stage PCR errors becoming significant is
                  investigated by simulation, which underlines the
                  importance of the initial copy number.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/dave/nips454.pdf">web
                  site</a>}
}

@inproceedings{KnoMin11,
  author       = {David A.~Knowles and Thomas P.~Minka},
  title        = {Non-conjugate Variational Message Passing for
                  Multinomial and Binary Regression},
  booktitle    = nips25,
  year         = 2011,
  url          = {.},
  cat          = {np clust},
  abstract     = {Variational Message Passing (VMP) is an algorithmic
                  implementation of the Variational Bayes (VB) method
                  which applies only in the special case of conjugate
                  exponential family models. We propose an extension
                  to VMP, which we refer to as Non-conjugate
                  Variational Message Passing (NCVMP) which aims to
                  alleviate this restriction while maintaining
                  modularity, allowing choice in how expectations are
                  calculated, and integrating into an existing
                  message-passing framework: Infer.NET. We demonstrate
                  NCVMP on logistic binary and multinomial
                  regression. In the multinomial case we introduce a
                  novel variational bound for the softmax factor which
                  is tighter than other commonly used bounds whilst
                  maintaining computational tractability.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/dave/nips2011.pdf">web
                  site</a> <a
                  href="http://mlg.eng.cam.ac.uk/dave/nips2011supp.pdf">supplementary</a>}
}

@inproceedings{KnoParGlaWin10,
  author       = {David A. Knowles and Leopold Parts and Daniel Glass
                  and John M. Winn},
  title        = {Modeling skin and ageing phenotypes using latent
                  variable models in Infer.NET},
  booktitle    = {NIPS Workshop: Predictive Models in Personalized
                  Medicine Workshop},
  year         = 2010,
  url          = {.},
  cat          = {bioinf},
  abstract     = {We demonstrate and compare three unsupervised
                  Bayesian latent variable models implemented in
                  Infer.NET for biomedical data modeling of 42 skin
                  and ageing phenotypes measured on the 12,000 female
                  twins in the Twins UK study. We address various data
                  modeling problems include high missingness,
                  heterogeneous data, and repeat observations. We
                  compare the proposed models in terms of their
                  performance at predicting disease labels and
                  symptoms from available explanatory variables,
                  concluding that factor analysis type models have the
                  strongest statistical performance in this
                  setting. We show that such models can be combined
                  with regression components for improved
                  interpretability.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/dave/knowles2010_predictive_medicine.pdf">web
                  site</a>}
}

@inproceedings{KnoParGlaWin11,
  author       = {David A. Knowles and Leopold Parts and Daniel Glass
                  and John M. Winn},
  title        = {Inferring a measure of physiological age from
                  multiple ageing related phenotypes},
  booktitle    = {NIPS Workshop: From Statistical Genetics to
                  Predictive Models in Personalized Medicine},
  year         = 2011,
  url          = {.},
  cat          = {bioinf},
  abstract     = {What is ageing? One definition is simultaneous
                  degradation of multiple organ systems. Can an
                  individual be said to be "old" or "young" for their
                  (chronological) age in a scientifically meaningful
                  way? We investigate these questions using ageing
                  related phenotypes measured on the 12,000 female
                  twins in the Twins UK study. We propose a simple
                  linear model of ageing, which allows a latent
                  adjustment to be made to an individual's
                  chronological age to give her "physiological age",
                  shared across the observed phenotypes. We note
                  problems with the analysis resulting from the
                  linearity assumption and show how to alleviate these
                  issues using a non-linear extension. We find more
                  gene expression probes are significantly associated
                  with our measurement of physiological age than to
                  chronological age. },
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/dave/physiological_age.pdf">web
                  site</a>}
}

@inproceedings{KocBanLiketal03,
  cat          = {gp},
  booktitle    = {IFAC International Conference on Intelligent
                  Control Systems and Signal Processing},
  title        = {A case based comparison of identification with
                  neural network and {G}aussian process models},
  author       = {Ju{\v s} Kocijan and Bla{\v z} Banko and Bojan Likar
                  and Agathe Girard and Roderick Murray-Smith and Carl
                  Edward Rasmussen},
  year         = 2003,
  volume       = 1,
  pages        = {137--142},
  url          = {.},
  abstract     = {In this paper an alternative approach to black-box
                  identification of non-linear dynamic systems is
                  compared with the more established approach of using
                  artificial neural networks. The Gaussian process
                  prior approach is a representative of non-parametric
                  modelling approaches. It was compared on a pH
                  process modelling case study. The purpose of
                  modelling was to use the model for control
                  design. The comparison revealed that even though
                  Gaussian process models can be effectively used for
                  modelling dynamic systems caution has to be
                  exercised when signals are selected.}
}

@inproceedings{KocMurRasGir04,
  cat          = {gp rl},
  author       = {Ju{\v s} Kocijan and Roderick Murray-Smith and Carl
                  Edward Rasmussen and Agathe Girard},
  title        = {Gaussian process model based predictive control},
  year         = 2004,
  pages        = {2214--2219},
  journal      = {Proceedings of the ACC 2004},
  abstract     = {Gaussian process models provide a probabilistic
                  non-parametric modelling approach for black-box
                  identification of non-linear dynamic systems. The
                  Gaussian processes can highlight areas of the input
                  space where prediction quality is poor, due to the
                  lack of data or its complexity, by indicating the
                  higher variance around the predicted mean. Gaussian
                  process models contain noticeably less coefficients
                  to be optimised. This paper illustrates possible
                  application of Gaussian process models within
                  model-based predictive control. The extra
                  information provided within Gaussian process model
                  is used in predictive control, where optimisation of
                  control signal takes the variance information into
                  account. The predictive control principle is
                  demonstrated on control of pH process benchmark.},
  booktitle    = {American Control Conference},
  location     = {Boston, MA},
  url          = {.}
}

@inproceedings{KocMurRasLik03,
  cat          = {gp rl},
  title        = {Predictive control with {G}aussian process models},
  author       = {Ju{\v s} Kocijan and Roderick Murray-Smith and Carl
                  Edward Rasmussen and Bojan Likar},
  editor       = {B.~Zajc and M.~Tkal{\v c}i{\v c}},
  pages        = {352--356},
  url          = {.},
  year         = 2003,
  booktitle    = {IEEE Region 8 Eurocon 2003: Computer as a Tool},
  abstract     = {This paper describes model-based predictive control
                  based on Gaussian processes. Gaussian process models
                  provide a probabilistic non-parametric modelling
                  approach for black-box identification of non-linear
                  dynamic systems. It offers more insight in variance
                  of obtained model response, as well as fewer
                  parameters to determine than other models. The
                  Gaussian processes can highlight areas of the input
                  space where prediction quality is poor, due to the
                  lack of data or its complexity, by indicating the
                  higher variance around the predicted mean. This
                  property is used in predictive control, where
                  optimisation of control signal takes the variance
                  information into account. The predictive control
                  principle is demonstrated on a simulated example of
                  nonlinear system.}
}

@techreport{KusPfiCsaRas05,
  cat          = {gp},
  author       = {Malte Ku{\ss} and Tobias Pfingsten and Lehel
                  Csat{\'o} and Carl Edward Rasmussen},
  title        = {Approximate Inference for Robust {G}aussian Process
                  Regression},
  year         = 2005,
  institution  = {Max Planck Institute for Biological Cybernetics},
  number       = 136,
  address      = {T{\"u}bingen, Germany},
  abstract     = {Gaussian process (GP) priors have been successfully
                  used in non-parametric Bayesian regression and
                  classification models. Inference can be performed
                  analytically only for the regression model with
                  Gaussian noise. For all other likelihood models
                  inference is intractable and various approximation
                  techniques have been proposed. In recent years
                  expectation-propagation (EP) has been developed as a
                  general method for approximate inference. This
                  article provides a general summary of how
                  expectation-propagation can be used for approximate
                  inference in Gaussian process models. Furthermore we
                  present a case study describing its implementation
                  for a new robust variant of Gaussian process
                  regression. To gain further insights into the
                  quality of the EP approximation we present
                  experiments in which we compare to results obtained
                  by Markov chain Monte Carlo (MCMC) sampling.},
  url          = {.}
}

@article{KusRas05,
  cat          = {gp approx},
  author       = {Malte Ku{\ss} and Carl Edward Rasmussen},
  title        = {Assessing Approximate Inference for Binary
                  {G}aussian Process Classification},
  journal      = jmlr,
  year         = 2005,
  volume       = 6,
  pages        = {1679--1704},
  url          =
                  {http://www.jmlr.org/papers/volume6/kuss05a/kuss05a.pdf},
  abstract     = {Gaussian process priors can be used to define
                  flexible, probabilistic classification
                  models. Unfortunately exact Bayesian inference is
                  analytically intractable and various approximation
                  techniques have been proposed. In this work we
                  review and compare Laplace's method and Expectation
                  Propagation for approximate Bayesian inference in
                  the binary Gaussian process classification model. We
                  present a comprehensive comparison of the
                  approximations, their predictive performance and
                  marginal likelihood estimates to results obtained by
                  MCMC sampling. We explain theoretically and
                  corroborate empirically the advantages of
                  Expectation Propagation compared to Laplace's
                  method.}
}

@inproceedings{KusRas06,
  cat          = {gp rl},
  author       = {Malte Ku{\ss} and Carl Edward Rasmussen},
  title        = {Assessing Approximations for {G}aussian Process
                  Classification},
  year         = 2006,
  publisher    = mit,
  pages        = {699--706},
  month        = {April},
  booktitle    = nips18,
  editor       = {Y.~Weiss and B.~Sch{\"o}lkopf and J.~Platt},
  address      = {Cambridge, MA, USA},
  abstract     = {Gaussian processes are attractive models for
                  probabilistic classification but unfortunately exact
                  inference is analytically intractable. We compare
                  Laplace's method and Expectation Propagation (EP)
                  focusing on marginal likelihood estimates and
                  predictive performance. We explain theoretically and
                  corroborate empirically that EP is superior to
                  Laplace. We also compare to a sophisticated MCMC
                  scheme and show that EP is surprisingly accurate.},
  location     = {Whistler, BC, Canada},
  url          = {.}
}

@inproceedings{LacHusGha11,
  cat          = {approx gp},
  url          = {.},
  author       = {Simon Lacoste-Julien and Ferenc Husz{\'a}r and
                  Zoubin Ghahramani},
  title        = {Approximate Inference for the Loss-Calibrated
                  {B}ayesian},
  booktitle    = aistats14,
  year         = 2011,
  editor       = {Geoff Gordon and David Dunson},
  volume       = 15,
  pages        = {416--424},
  address      = {Fort Lauderdale, FL, USA},
  month        = {April},
  publisher    = jmlr,
  abstract     = {We consider the problem of approximate inference in
                  the context of Bayesian decision theory. Traditional
                  approaches focus on approximating general properties
                  of the posterior, ignoring the decision task -- and
                  associated losses -- for which the posterior could
                  be used. We argue that this can be suboptimal and
                  propose instead to \emph{loss-calibrate} the
                  approximate inference methods with respect to the
                  decision task at hand. We present a general
                  framework rooted in Bayesian decision theory to
                  analyze approximate inference from the perspective
                  of losses, opening up several research
                  directions. As a first loss-calibrated approximate
                  inference attempt, we propose an EM-like algorithm
                  on the Bayesian posterior risk and show how it can
                  improve a standard approach to Gaussian process
                  classification when losses are asymmetric.}
}

@article{LazQuiRasFig10,
  cat          = {gp},
  author       = {Miguel L{\'a}zaro-Gredilla and Joaquin
                  Qui{\~n}onero-Candela and Carl Edward Rasmussen and
                  An{\'\i}bal Figueiras-Vidal},
  title        = {Sparse Spectrum {G}aussian Process Regression},
  journal      = jmlr,
  url          =
                  {http://jmlr.csail.mit.edu/papers/volume11/lazaro-gredilla10a/lazaro-gredilla10a.pdf},
  volume       = 11,
  pages        = {1865--1881},
  month        = {June},
  year         = 2010,
  abstract     = {We present a new sparse Gaussian Process (GP) model
                  for regression. The key novel idea is to sparsify
                  the \emph{spectral representation} of the GP. This
                  leads to a simple, practical algorithm for
                  regression tasks. We compare the achievable
                  trade-offs between predictive accuracy and
                  computational requirements, and show that these are
                  typically superior to existing state-of-the-art
                  sparse approximations. We discuss both the weight
                  space and function space representations, and note
                  that the new construction implies priors over
                  functions which are always stationary, and can
                  approximate any covariance function in this class.}
}

@article{LesChaKleetal10,
  author       = {J. Leskovec and D. Chakrabarti and J. Kleinberg and
                  C. Faloutsos and Z. Ghahramani},
  year         = 2010,
  title        = {Kronecker Graphs: An Approach to Modeling Networks},
  journal      = jmlr,
  volume       = 11,
  month        = {February},
  pages        = {985--1042},
  url          = {.},
  abstract     = {How can we generate realistic networks? In addition,
                  how can we do so with a mathematically tractable
                  model that allows for rigorous analysis of network
                  properties? Real networks exhibit a long list of
                  surprising properties: Heavy tails for the in- and
                  out-degree distribution, heavy tails for the
                  eigenvalues and eigenvectors, small diameters, and
                  densification and shrinking diameters over
                  time. Current network models and generators either
                  fail to match several of the above properties, are
                  complicated to analyze mathematically, or both. Here
                  we propose a generative model for networks that is
                  both mathematically tractable and can generate
                  networks that have all the above mentioned
                  structural properties. Our main idea here is to use
                  a non-standard matrix operation, the Kronecker
                  product, to generate graphs which we refer to as
                  "Kronecker graphs".<br/>First, we show that
                  Kronecker graphs naturally obey common network
                  properties. In fact, we rigorously prove that they
                  do so. We also provide empirical evidence showing
                  that Kronecker graphs can effectively model the
                  structure of real networks.<br/>We then present
                  KRONFIT, a fast and scalable algorithm for fitting
                  the Kronecker graph generation model to large real
                  networks. A naive approach to fitting would take
                  super-exponential time. In contrast, KRONFIT takes
                  linear time, by exploiting the structure of
                  Kronecker matrix multiplication and by using
                  statistical simulation techniques.  Experiments on a
                  wide range of large real and synthetic networks show
                  that KRONFIT finds accurate parameters that very
                  well mimic the properties of target networks. In
                  fact, using just four parameters we can accurately
                  model several aspects of global network
                  structure. Once fitted, the model parameters can be
                  used to gain insights about the network structure,
                  and the resulting synthetic graphs can be used for
                  null-models, anonymization, extrapolations, and
                  graph summarization.}
}

@article{LipGhaBor10,
  author       = {C. Lippert and Z. Ghahramani and K. Borgwardt},
  title        = {Gene function prediction from synthetic lethality
                  networks via ranking on demand},
  journal      = {Bioinformatics},
  volume       = 26,
  pages        = {912--918},
  year         = 2010,
  url          = {.},
  cat          = {bioinf},
  abstract     = {Motivation: Synthetic lethal interactions represent
                  pairs of genes whose individual mutations are not
                  lethal, while the double mutation of both genes does
                  incur lethality. Several studies have shown a
                  correlation between functional similarity of genes
                  and their distances in networks based on synthetic
                  lethal interactions. However, there is a lack of
                  algorithms for predicting gene function from
                  synthetic lethality interaction networks. <br/>
                  Results: In this article, we present a novel
                  technique called kernelROD for gene function
                  prediction from synthetic lethal interaction
                  networks based on kernel machines. We apply our
                  novel algorithm to Gene Ontology functional
                  annotation prediction in yeast.  Our experiments
                  show that our method leads to improved gene function
                  prediction compared with state-of-the-art
                  competitors and that combining genetic and
                  congruence networks leads to a further improvement
                  in prediction accuracy.}
}

@inproceedings{LipSteGhaetal09,
  cat          = {bioinf},
  volume       = 5,
  author       = {C. Lippert and O. Stegle and Z. Ghahramani and K.
                  Borgwardt},
  note         = {ISSN: 1938-7228},
  booktitle    = aistats12,
  editor       = {D. van Dyk and M. Welling},
  title        = {A kernel method for unsupervised structured network
                  inference},
  publisher    = jmlr,
  year         = 2009,
  month        = {April},
  address      = {Clearwater Beach, FL, USA},
  pages        = {368--375},
  url          = {.},
  abstract     = {Network inference is the problem of inferring edges
                  between a set of real-world objects, for instance,
                  interactions between pairs of proteins in
                  bioinformatics. Current kernel-based approaches to
                  this problem share a set of common features: (i)
                  they are supervised and hence require labeled
                  training data; (ii) edges in the network are treated
                  as mutually independent and hence topological
                  properties are largely ignored; (iii) they lack a
                  statistical interpretation. We argue that these
                  common assumptions are often undesirable for network
                  inference, and propose (i) an unsupervised kernel
                  method (ii) that takes the global structure of the
                  network into account and (iii) is statistically
                  motivated. We show that our approach can explain
                  commonly used heuristics in statistical terms. In
                  experiments on social networks, different variants of
                  our method demonstrate appealing predictive
                  performance.}
}

@inproceedings{MacBusCunetal11,
  cat          = {time},
  booktitle    = nips25,
  title        = {Empirical models of spiking in neural populations},
  author       = {J. H. Macke and L. Busing and J. P. Cunningham and
                  B. M. Yu and K. V. Shenoy and M. Sahani},
  year         = 2011,
  address      = {Granada, Spain},
  month        = {December},
  pages        = {1--8},
  url          = {.},
  abstract     = {Neurons in the neocortex code and compute as part of
                  a locally interconnected population. Large-scale
                  multi-electrode recording makes it possible to
                  access these population processes empirically by
                  fitting statistical models to unaveraged data. What
                  statistical structure best describes the concurrent
                  spiking of cells within a local network? We argue
                  that in the cortex, where firing exhibits extensive
                  correlations in both time and space and where a
                  typical sample of neurons still reflects only a very
                  small fraction of the local population, the most
                  appropriate model captures shared variability by a
                  low-dimensional latent process evolving with smooth
                  dynamics, rather than by putative direct
                  coupling. We test this claim by comparing a latent
                  dynamical model with realistic spiking observations
                  to coupled generalised linear spike-response models
                  (GLMs) using cortical recordings. We find that the
                  latent dynamical approach outperforms the GLM in
                  terms of goodness-of-fit, and reproduces the
                  temporal correlations in the data more
                  accurately. We also compare models whose
                  observations models are either derived from a
                  Gaussian or point-process models, finding that the
                  non-Gaussian model provides slightly better
                  goodness-of-fit and more realistic population spike
                  counts.}
}

@inproceedings{MchRas11,
  cat =            {gp},
  booktitle =      nips24,
  title =          {Gaussian Process Training with Input Noise},
  author =         {Andrew McHutchon and Carl Edward Rasmussen},
  year =           2011,
  address =        {Granada, Spain},
  month =          {December},
  url =            {http://mlg.eng.cam.ac.uk/mchutchon/papers/NIGP.pdf},
  abstract =       {In standard Gaussian Process regression input
                  locations are assumed to be noise free. We present a
                  simple yet effective GP model for training on input
                  points corrupted by i.i.d. Gaussian noise. To make
                  computations tractable we use a local linear
                  expansion about each input point. This allows the
                  input noise to be recast as output noise
                  proportional to the squared gradient of the GP
                  posterior mean. The input noise hyperparameters are
                  trained alongside other hyperparameters by the usual
                  method of maximisation of the marginal likelihood,
                  and allow estimation of the noise levels on each
                  input dimension. Training uses an iterative scheme,
                  which alternates between optimising the
                  hyperparameters and calculating the posterior
                  gradient. Analytic predictive moments can then be
                  found for Gaussian distributed test points. We
                  compare our model to others over a range of
                  different regression problems and show that it
                  improves over current methods.}
}

@inproceedings{MeeGhaNeaetal07,
  cat =            {np},
  month =          {September},
  author =         {E. Meeds and Z. Ghahramani and R. Neal and
                  S. T. Roweis},
  series =         {Bradford Books},
  note =           {Online contents gives pages 1002--1009, and 977--984
                  on pdf contents.},
  booktitle =      nips19,
  editor =         {B. Sch{\"o}lkopf and J. Platt and T. Hofmann},
  title =          {Modelling dyadic data with binary latent factors},
  address =        {Cambridge, MA, USA},
  publisher =      mit,
  year =           2007,
  pages =          {977--984},
  url =            {.},
  abstract =       {We introduce binary matrix factorization, a novel
                  model for unsupervised matrix decomposition. The
                  decomposition is learned by fitting a non-parametric
                  Bayesian probabilistic model with binary latent
                  variables to a matrix of dyadic data. Unlike
                  bi-clustering models, which assign each row or
                  column to a single cluster based on a categorical
                  hidden feature, our binary feature model reflects
                  the prior belief that items and attributes can be
                  associated with more than one latent cluster at a
                  time. We provide simple learning and inference rules
                  for this new model and show how to extend it to an
                  infinite model in which the number of features is
                  not a priori fixed but is allowed to grow with the
                  size of the data.}
}

@phdthesis{Moh11,
  author   = {Shakir Mohamed},
  title    = {Generalised {B}ayesian Matrix Factorisation Models},
  school   = {University of Cambridge, Department of Engineering},
  address  = {Cambridge, UK},
  year     = 2011,
  url      = {.},
  abstract = {Factor analysis and related models for probabilistic
                  matrix factorisation are of central importance to
                  the unsupervised analysis of data, with a colourful
                  history more than a century long. Probabilistic
                  models for matrix factorisation allow us to explore
                  the underlying structure in data, and have relevance
                  in a vast number of application areas including
                  collaborative filtering, source separation, missing
                  data imputation, gene expression analysis,
                  information retrieval, computational finance and
                  computer vision, amongst others.<br> This thesis
                  develops generalisations of matrix factorisation
                  models that advance our understanding and enhance
                  the applicability of this important class of
                  models. The generalisation of models for matrix
                  factorisation focuses on three concerns: widening
                  the applicability of latent variable models to the
                  diverse types of data that are currently available;
                  considering alternative structural forms in the
                  underlying representations that are inferred; and
                  including higher order data structures into the
                  matrix factorisation framework. These three issues
                  reflect the reality of modern data analysis and we
                  develop new models that allow for a principled
                  exploration and use of data in these settings. We
                  place emphasis on Bayesian approaches to learning
                  and the advantages that come with the Bayesian
                  methodology. Our port of departure is a
                  generalisation of latent variable models to members
                  of the exponential family of distributions. This
                  generalisation allows for the analysis of data that
                  may be real-valued, binary, counts, non-negative or
                  a heterogeneous set of these data types. The model
                  unifies various existing models and constructs for
                  unsupervised settings, the complementary framework
                  to the generalised linear models in regression.<br>
                  Moving to structural considerations, we develop
                  Bayesian methods for learning sparse latent
                  representations. We define ideas of weakly and
                  strongly sparse vectors and investigate the classes
                  of prior distributions that give rise to these forms
                  of sparsity, namely the scale-mixture of Gaussians
                  and the spike-and-slab distribution. Based on these
                  sparsity favouring priors, we develop and compare
                  methods for sparse matrix factorisation and present
                  the first comparison of these sparse learning
                  approaches. As a second structural consideration, we
                  develop models with the ability to generate
                  correlated binary vectors. Moment-matching is used
                  to allow binary data with specified correlation to
                  be generated, based on dichotomisation of the
                  Gaussian distribution. We then develop a novel and
                  simple method for binary PCA based on Gaussian
                  dichotomisation. The third generalisation considers
                  the extension of matrix factorisation models to
                  multi-dimensional arrays of data that are
                  increasingly prevalent. We develop the first
                  Bayesian model for non-negative tensor factorisation
                  and explore the relationship between this model and
                  the previously described models for matrix
                  factorisation.}
}

@inproceedings{MohHelGha08,
  author    = {Shakir Mohamed and Katherine A. Heller and Zoubin
                  Ghahramani},
  title     = {Bayesian Exponential Family {PCA}},
  booktitle = nips21,
  editor    = {D. Koller and D. Schuurmans and Y. Bengio and
                  L. Bottou},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  pages     = {1089--1096},
  month     = {December},
  year      = 2009,
  url       = {.},
  annote    = {<a
                  href="http://mlg.eng.cam.ac.uk/shakir/papers/NIPS08spotlight.pdf">spotlight</a>.},
  abstract  = {Principal Components Analysis (PCA) has become
                  established as one of the key tools for
                  dimensionality reduction when dealing with real
                  valued data. Approaches such as exponential family
                  PCA and non-negative matrix factorisation have
                  successfully extended PCA to non-Gaussian data
                  types, but these techniques fail to take advantage
                  of Bayesian inference and can suffer from problems
                  of overfitting and poor generalisation. This paper
                  presents a fully probabilistic approach to PCA,
                  which is generalised to the exponential family,
                  based on Hybrid Monte Carlo sampling. We describe
                  the model which is based on a factorisation of the
                  observed data matrix, and show performance of the
                  model on both synthetic and real data.}
}

@article{MovChoKnoetal11,
  cat =            {bioinf},
  author =         {Mehregan Movassagh and Mun-Kit Choy and David
                  A. Knowles and Lina Cordeddu and Syed Haider and
                  Thomas Down and Lee Siggens and Ana Vujic and Ilenia
                  Simeoni and Chris Penkett and Martin Goddard and
                  Pietro Li{\`o} and Martin Bennett and Roger Foo},
  title =          {Distinct epigenomic features in human
                  cardiomyopathy},
  url =
                  {http://circ.ahajournals.org/content/early/2011/10/24/CIRCULATIONAHA.111.040071.abstract},
  journal =        {Circulation, American Heart Association},
  year =           2011,
  abstract =       {Background. The epigenome refers to marks on the
                  genome including DNA methylation and histone
                  modifications that regulate the expression of
                  underlying genes. A consistent profile of gene
                  expression changes in end-stage cardiomyopathy led
                  us to hypothesise that distinct global patterns of
                  the epigenome may also exist.  Methods and
                  Results. We constructed genome-wide maps of DNA
                  methylation and Histone-3 Lysine-36 tri-methylation
                  (H3K36me3)-enrichment for cardiomyopathic and normal
                  human hearts. 506Mb of sequence per library was
                  generated by high-throughput sequencing, covering 24
                  million out of the 28 million CG di-nucleotides in
                  the human genome. DNA methylation was significantly
                  different in promoter CpG-islands (CGI), intra-genic
                  CGI, gene bodies and H3K36me3-enriched regions of
                  the genome. Moreover DNA methylation differences
                  were present in promoters of upregulated genes but
                  not down-regulated genes. The profile of
                  H3K36me3-enrichment itself was also significantly
                  different in protein-coding regions of the genome.
                  Conclusions. Distinct epigenomic patterns exist in
                  important DNA elements of the human cardiac genome
                  in end-stage cardiomyopathy. If epigenomic patterns
                  track with disease progression, assays for the
                  epigenome may be more useful than quantification of
                  mRNA for assessing prognosis in heart failure. These
                  results open up an important new horizon of research
                  and further studies will be needed to determine how
                  epigenomics contribute to altered gene expression in
                  cardiomyopathy.}
}

@inproceedings{MurSbaRasGir03,
  cat =            {gp rl},
  author =         {Roderick Murray-Smith and Daniel Sbarbaro and Carl
                  Edward Rasmussen and Agathe Girard},
  title =          {Adaptive, Cautious, Predictive control with
                  {G}aussian Process Priors},
  year =           2003,
  publisher =      {Elsevier Science Ltd},
  pages =          {1195--1200},
  month =          {August},
  booktitle =      {Proceedings of the 13th IFAC Symposium on System
                  Identification (SYSID 2003)},
  editor =         {P.~Van den Hof and B.~Wahlberg and S.~Weiland},
  url =            {.},
  address =        {Oxford, UK},
  abstract =       {Nonparametric Gaussian Process models, a Bayesian
                  statistics approach, are used to implement a
                  nonlinear adaptive control law. Predictions,
                  including propagation of the state uncertainty are
                  made over a k-step horizon. The expected value of a
                  quadratic cost function is minimised, over this
                  prediction horizon, without ignoring the variance of
                  the model predictions. The general method and its
                  main features are illustrated on a simulation
                  example.},
  location =       {Rotterdam, The Netherlands}
}

@article{NicRas08,
  cat      = {gp approx},
  author   = {Hannes Nickisch and Carl Edward Rasmussen},
  title    = {Approximations for Binary {G}aussian Process
                  Classification},
  journal  = jmlr,
  volume   = 9,
  pages    = {2035--2078},
  month    = {October},
  year     = 2008,
  url      = {http://www.jmlr.org/papers/volume9/nickisch08a/nickisch08a.pdf},
  abstract = {We provide a comprehensive overview of many recent
                  algorithms for approximate inference in Gaussian
                  process models for probabilistic binary
                  classification. The relationships between several
                  approaches are elucidated theoretically, and the
                  properties of the different algorithms are
                  corroborated by experimental results. We examine
                  both 1) the quality of the predictive distributions
                  and 2) the suitability of the different marginal
                  likelihood approximations for model selection
                  (selecting hyperparameters) and compare to a gold
                  standard based on MCMC. Interestingly, some methods
                  produce good predictive distributions although their
                  marginal likelihood approximations are poor.  Strong
                  conclusions are drawn about the methods: The
                  Expectation Propagation algorithm is almost always
                  the method of choice unless the computational budget
                  is very tight. We also extend existing methods in
                  various ways, and provide unifying code implementing
                  all approaches.}
}

@inproceedings{NicRas10,
  cat       = {gp},
  author    = {Hannes Nickisch and Carl Edward Rasmussen},
  title     = {Gaussian Mixture Modeling with {G}aussian Process
                  Latent Variable Models},
  booktitle = {Proceedings of the 32nd DAGM Symposium on Pattern
                  Recognition},
  series    = lncs,
  publisher = {Springer},
  address   = {Darmstadt, Germany},
  month     = {September},
  year      = 2010,
  doi       = {10.1007/978-3-642-15986-2_28},
  url       = {.},
  abstract  = {Density modeling is notoriously difficult for high
                  dimensional data. One approach to the problem is to
                  search for a lower dimensional manifold which
                  captures the main characteristics of the
                  data. Recently, the Gaussian Process Latent Variable
                  Model (GPLVM) has successfully been used to find low
                  dimensional manifolds in a variety of complex
                  data. The GPLVM consists of a set of points in a low
                  dimensional latent space, and a stochastic map to
                  the observed space. We show how it can be
                  interpreted as a density model in the observed
                  space. However, the GPLVM is not trained as a
                  density model and therefore yields bad density
                  estimates. We propose a new training strategy and
                  obtain improved generalisation performance and
                  better density estimates in comparative evaluations
                  on several benchmark data sets.}
}

@inproceedings{Orb09,
  cat =            {np},
  author =         {Peter Orbanz},
  title =          {Construction of Nonparametric {B}ayesian Models from
                  Parametric {B}ayes Equations},
  url =            {.},
  publisher =      mit,
  booktitle =      nips22,
  year =           2009,
  pages =          {1392--1400},
  editor =         {Y.~Bengio and D.~Schuurmans and J.~Lafferty and
                  C.~K.~I.~Williams and A.~Culotta},
  abstract =       {We consider the general problem of constructing
                  nonparametric Bayesian models on
                  infinite-dimensional random objects, such as
                  functions, infinite graphs or infinite
                  permutations. The problem has generated much
                  interest in machine learning, where it is treated
                  heuristically, but has not been studied in full
                  generality in nonparametric Bayesian statistics,
                  which tends to focus on models over probability
                  distributions. Our approach applies a standard tool
                  of stochastic process theory, the construction of
                  stochastic processes from their finite-dimensional
                  marginal distributions. The main contribution of the
                  paper is a generalization of the classic Kolmogorov
                  extension theorem to conditional probabilities. This
                  extension allows a rigorous construction of
                  nonparametric Bayesian models from systems of
                  finite-dimensional, parametric Bayes equations. Using
                  this approach, we show (i) how existence of a
                  conjugate posterior for the nonparametric model can
                  be guaranteed by choosing conjugate
                  finite-dimensional models in the construction, (ii)
                  how the mapping to the posterior parameters of the
                  nonparametric model can be explicitly determined,
                  and (iii) that the construction of conjugate models
                  in essence requires the finite-dimensional models to
                  be in the exponential family. As an application of
                  our constructive framework, we derive a model on
                  infinite permutations, the nonparametric Bayesian
                  analogue of a model recently proposed for the
                  analysis of rank data.},
  annote =         {<a
                  href="http://mlg.eng.cam.ac.uk/porbanz/reports/NIPS2009_0901_extra.pdf">Supplements
                  (proofs)</a> and <a
                  href="http://mlg.eng.cam.ac.uk/porbanz/reports/porbanz_NIPS09TR.pdf">techreport
                  version</a>}
}

@phdthesis{Ort11,
  cat      = {rl},
  author   = {Pedro A.~Ortega},
  title    = {A Unified Framework for Resource-Bounded Agents
                  Interacting with an Unknown Environment},
  school   = {Department of Engineering, University of Cambridge},
  year     = 2011,
  url      = {http://www.dcc.uchile.cl/~peortega/home/lib/exe/fetch.php?id=autonomous_agents&cache=cache&media=thesis.pdf},
  abstract = {The aim of this thesis is to present a mathematical
                  framework for conceptualizing and constructing
                  adaptive autonomous systems under resource
                  constraints. The first part of this thesis contains
                  a concise presentation of the foundations of
                  classical agency: namely the formalizations of
                  decision making and learning. Decision making
                  includes: (a) subjective expected utility (SEU)
                  theory, the framework of decision making under
                  uncertainty; (b) the maximum SEU principle to choose
                  the optimal solution; and (c) its application to the
                  design of autonomous systems, culminating in the
                  Bellman optimality equations. Learning includes: (a)
                  Bayesian probability theory, the theory for
                  reasoning under uncertainty that extends logic; and
                  (b) Bayes-Optimal agents, the application of
                  Bayesian probability theory to the design of optimal
                  adaptive agents. Then, two major problems of the
                  maximum SEU principle are highlighted: (a) the
                  prohibitive computational costs and (b) the need for
                  the causal precedence of the choice of the
                  policy. The second part of this thesis tackles the
                  two aforementioned problems.  First, an
                  information-theoretic notion of resources in
                  autonomous systems is established. Second, a
                  framework for resource-bounded agency is
                  introduced. This includes: (a) a maximum bounded SEU
                  principle that is derived from a set of axioms of
                  utility; (b) an axiomatic model of probabilistic
                  causality, which is applied for the formalization of
                  autonomous systems having uncertainty over their
                  policy and environment; and (c) the Bayesian control
                  rule, which is derived from the maximum bounded SEU
                  principle and the model of causality, implementing a
                  stochastic adaptive control law that deals with the
                  case where autonomous agents are uncertain about
                  their policy and environment.}
}

@inproceedings{OrtBra10a,
  cat =            {rl},
  author =         {Pedro A.~Ortega and Daniel A.~Braun},
  title =          {A conversion between utility and information},
  booktitle =      {The third conference on artificial general
                  intelligence},
  publisher =      {Atlantis Press},
  address =        {Paris},
  pages =          {115--120},
  year =           2010,
  url =
                  {http://agi-conf.org/2010/wp-content/uploads/2009/06/paper_35.pdf},
  abstract =       {Rewards typically express desirabilities or
                  preferences over a set of alternatives. Here we
                  propose that rewards can be defined for any
                  probability distribution based on three desiderata,
                  namely that rewards should be real-valued, additive
                  and order-preserving, where the latter implies that
                  more probable events should also be more
                  desirable. Our main result states that rewards are
                  then uniquely determined by the negative information
                  content. To analyze stochastic processes, we define
                  the utility of a realization as its reward rate.
                  Under this interpretation, we show that the expected
                  utility of a stochastic process is its negative
                  entropy rate. Furthermore, we apply our results to
                  analyze agent-environment interactions. We show that
                  the expected utility that will actually be achieved
                  by the agent is given by the negative cross-entropy
                  from the input-output (I/O) distribution of the
                  coupled interaction system and the agent's I/O
                  distribution. Thus, our results allow for an
                  information-theoretic interpretation of the notion
                  of utility and the characterization of
                  agent-environment interactions in terms of entropy
                  dynamics.}
}

@inproceedings{OrtBra10b,
  cat =            {rl},
  author =         {Pedro A.~Ortega and Daniel A.~Braun},
  title =          {A {B}ayesian rule for adaptive control based on
                  causal interventions},
  booktitle =      {The third conference on artificial general
                  intelligence},
  publisher =      {Atlantis Press},
  address =        {Paris},
  pages =          {121--126},
  year =           2010,
  url =
                  {http://agi-conf.org/2010/wp-content/uploads/2009/06/paper_28.pdf},
  abstract =       {Explaining adaptive behavior is a central problem in
                  artificial intelligence research. Here we formalize
                  adaptive agents as mixture distributions over
                  sequences of inputs and outputs (I/O). Each
                  distribution of the mixture constitutes a "possible
                  world", but the agent does not know which of the
                  possible worlds it is actually facing. The problem
                  is to adapt the I/O stream in a way that is
                  compatible with the true world. A natural measure of
                  adaptation can be obtained by the Kullback-Leibler
                  (KL) divergence between the I/O distribution of the
                  true world and the I/O distribution expected by the
                  agent that is uncertain about possible worlds. In
                  the case of pure input streams, the Bayesian mixture
                  provides a well-known solution for this problem. We
                  show, however, that in the case of I/O streams this
                  solution breaks down, because outputs are issued by
                  the agent itself and require a different
                  probabilistic syntax as provided by intervention
                  calculus. Based on this calculus, we obtain a
                  Bayesian control rule that allows modeling adaptive
                  behavior with mixture distributions over I/O
                  streams. This rule might allow for a novel approach
                  to adaptive control based on a minimum
                  KL-principle.}
}

@article{OrtBra10c,
  cat      = {rl},
  author   = {Pedro A.~Ortega and Daniel A.~Braun},
  title    = {A minimum relative entropy principle for learning
                  and acting},
  journal  = {Journal of Artificial Intelligence Research},
  volume   = 38,
  pages    = {475--511},
  year     = 2010,
  doi      = {10.1613/jair.3062},
  url      = {http://www.jair.org/vol/vol38.html},
  abstract = {This paper proposes a method to construct an
                  adaptive agent that is universal with respect to a
                  given class of experts, where each expert is
                  designed specifically for a particular
                  environment. This adaptive control problem is
                  formalized as the problem of minimizing the relative
                  entropy of the adaptive agent from the expert that
                  is most suitable for the unknown environment. If the
                  agent is a passive observer, then the optimal
                  solution is the well-known Bayesian
                  predictor. However, if the agent is active, then its
                  past actions need to be treated as causal
                  interventions on the I/O stream rather than normal
                  probability conditions.  Here it is shown that the
                  solution to this new variational problem is given by
                  a stochastic controller called the Bayesian control
                  rule, which implements adaptive behavior as a
                  mixture of experts. Furthermore, it is shown that
                  under mild assumptions, the Bayesian control rule
                  converges to the control law of the most suitable
                  expert.}
}

@techreport{OrtBra10d,
  cat         = {rl},
  author      = {Pedro A.~Ortega and Daniel A.~Braun},
  title       = {An axiomatic formalization of bounded rationality
                  based on a utility-information equivalence},
  institution = {Dept. of Engineering, University of Cambridge},
  year        = 2010,
  url         = {http://arxiv.org/abs/1007.0940},
  abstract    = {Classic decision-theory is based on the maximum
                  expected utility (MEU) principle, but crucially
                  ignores the resource costs incurred when determining
                  optimal decisions. Here we propose an axiomatic
                  framework for bounded decision-making that considers
                  resource costs. Agents are formalized as probability
                  measures over input-output streams. We postulate
                  that any such probability measure can be assigned a
                  corresponding conjugate utility function based on
                  three axioms: utilities should be real-valued,
                  additive and monotonic mappings of probabilities. We
                  show that these axioms enforce a unique conversion
                  law between utility and probability (and thereby,
                  information).  Moreover, we show that this relation
                  can be characterized as a variational principle:
                  given a utility function, its conjugate probability
                  measure maximizes a free utility
                  functional. Transformations of probability measures
                  can then be formalized as a change in free utility
                  due to the addition of new constraints expressed by
                  a target utility function. Accordingly, one obtains
                  a criterion to choose a probability measure that
                  trades off the maximization of a target utility
                  function and the cost of the deviation from a
                  reference distribution. We show that optimal
                  control, adaptive estimation and adaptive control
                  problems can be solved this way in a
                  resource-efficient way. When resource costs are
                  ignored, the MEU principle is recovered. Our
                  formalization might thus provide a principled
                  approach to bounded rationality that establishes a
                  close link to information theory.}
}

@inproceedings{OrtBra11,
  cat =            {rl},
  author =         {Pedro A.~Ortega and Daniel A.~Braun},
  title =          {Information, Utility and Bounded Rationality},
  booktitle =      {The fourth conference on artificial general
                  intelligence},
  year =           2011,
  volume =         6830,
  pages =          {269--274},
  series =         {Lecture Notes in Artificial Intelligence},
  publisher =      {Springer-Verlag},
  abstract =       {Perfectly rational decision-makers maximize expected
                  utility, but crucially ignore the resource costs
                  incurred when determining optimal actions. Here we
                  employ an axiomatic framework for bounded rational
                  decision-making based on a thermodynamic
                  interpretation of resource costs as information
                  costs. This leads to a variational free utility
                  principle akin to thermodynamical free energy that
                  trades off utility and information costs. We show
                  that bounded optimal control solutions can be
                  derived from this variational principle, which leads
                  in general to stochastic policies. Furthermore, we
                  show that risk-sensitive and robust (minimax)
                  control schemes fall out naturally from this
                  framework if the environment is considered as a
                  bounded rational and perfectly rational opponent,
                  respectively. When resource costs are ignored, the
                  maximum expected utility principle is recovered.},
  url =
                  {http://www.dcc.uchile.cl/~peortega/home/lib/exe/fetch.php?id=autonomous_agents&cache=cache&media=infoutilityboundedrationality.pdf}
}

@inproceedings{OrtBraGod11,
  cat =		 {rl},
  author =	 {Pedro A.~Ortega and Daniel A.~Braun and Simon
                  Godsill},
  title =	 {Reinforcement Learning and the {B}ayesian Control
                  Rule},
  booktitle =	 {The fourth conference on artificial general
                  intelligence},
  year =	 2011,
  volume =	 6830,
  pages =	 {281--285},
  series =	 {Lecture Notes in Artificial Intelligence},
  publisher =	 {Springer-Verlag},
  abstract =	 {We present an actor-critic scheme for reinforcement
                  learning in complex domains. The main contribution
                  is to show that planning and I/O dynamics can be
                  separated such that an intractable planning problem
                  reduces to a simple multi-armed bandit problem,
                  where each lever stands for a potentially
                  arbitrarily complex policy. Furthermore, we use the
                  Bayesian control rule to construct an adaptive
                  bandit player that is universal with respect to a
                  given class of optimal bandit players, thus
                  indirectly constructing an adaptive agent that is
                  universal with respect to a given class of
                  policies.},
  url =
                  {http://www.dcc.uchile.cl/~peortega/home/lib/exe/fetch.php?id=autonomous_agents&cache=cache&media=actor-critic.pdf}
}

@incollection{PerGhaPon07,
  cat =		 {gm},
  month =	 {September},
  author =	 {F. P{\'e}rez-Cruz and Zoubin Ghahramani and
                  M. Pontil},
  note =	 {Chapter 12.},
  booktitle =	 {Predicting Structured Data},
  editor =	 {G.~H.~Bakir and T.~Hofmann and B.~Sch{\"o}lkopf and
                  A.~J.~Smola and B.~Taskar and S.~V.~N.~Vishwanathan},
  address =	 {Cambridge, MA, USA},
  title =	 {Conditional graphical models},
  publisher =	 mit,
  year =	 2007,
  pages =	 {265--282},
  url =		 {.},
  abstract =	 {In this chapter we propose a modification of
                  CRF-like algorithms that allows for solving
                  large-scale structured classification problems. Our
                  approach consists in upper bounding the CRF
                  functional in order to decompose its training into
                  independent optimisation problems per
                  clique. Furthermore we show that each sub-problem
                  corresponds to solving a multiclass learning task in
                  each clique, which enlarges the applicability of
                  these tools for large-scale structural learning
                  problems.  Before presenting the Conditional
                  Graphical Model (CGM), as we refer to this
                  procedure, we review the family of CRF
                  algorithms. We concentrate on the best known
                  procedures and standard generalisations of CRFs. The
                  objective of this introduction is analysing from
                  the same viewpoint the proposed solutions in the
                  literature to tackle this problem, which allows
                  comparing their different features.  We complete the
                  chapter with a case study, in which we show the
                  possibility to work with large-scale problems using
                  CGM and that the obtained performance is comparable
                  to the result with CRF-like algorithms.}
}

@inproceedings{PetYuCunetal11,
  cat =		 {time},
  booktitle =	 nips25,
  title =	 {Dynamical Segmentation of single trials from
                  population neural data},
  author =	 {B. Petreska and B. M. Yu and J. P. Cunningham and
                  G. Santhanam and S. I. Ryu and K. V. Shenoy and
                  M. Sahani},
  year =	 2011,
  address =	 {Granada, Spain},
  month =	 {December},
  pages =	 {1--8},
  url =		 {.},
  abstract =	 {Simultaneous recordings of many neurons embedded
                  within a recurrently-connected cortical network may
                  provide concurrent views into the dynamical
                  processes of that network, and thus its
                  computational function. In principle, these dynamics
                  might be identified by purely unsupervised,
                  statistical means. Here, we show that a Hidden
                  Switching Linear Dynamical Systems (HSLDS) model -
                  in which multiple linear dynamical laws approximate
                  a nonlinear and potentially non-stationary
                  dynamical process - is able to distinguish dynamical
                  regimes within single-trial motor cortical activity
                  associated with the preparation and initiation of
                  hand movements. The regimes are identified without
                  reference to behavioural or experimental epochs, but
                  nonetheless transitions between them correlate
                  strongly with external events whose timing may vary
                  from trial to trial. The HSLDS model also performs
                  better than recent comparable models in predicting
                  the firing rate of an isolated neuron based on the
                  firing rates of others, suggesting that it captures
                  more of the "shared variance" of the data. Thus, the
                  method is able to trace the dynamical processes
                  underlying the coordinated evolution of network
                  activity in a way that appears to reflect its
                  computational role.}
}

@article{PfiHerRas06,
  annote =	 {Winner of the 2006 Best Paper Award for the
                  journal.},
  author =	 {Tobias Pfingsten and Daniel Herrmann and Carl Edward
                  Rasmussen},
  journal =	 {IEEE Transactions on Semiconductor Manufacturing},
  number =	 4,
  pages =	 {475--486},
  title =	 {Model-based design analysis and yield optimization},
  url =		 {http://dx.doi.org/10.1109/TSM.2006.883589},
  doi =		 {10.1109/TSM.2006.883589},
  year =	 2006,
  volume =	 19,
  abstract =	 {Fluctuations are inherent to any fabrication
                  process. Integrated circuits and
                  microelectromechanical systems are particularly
                  affected by these variations, and due to
                  high-quality requirements the effect on the devices'
                  performance has to be understood quantitatively. In
                  recent years, it has become possible to model the
                  performance of such complex systems on the basis of
                  design specifications, and model-based sensitivity
                  analysis has made its way into industrial
                  engineering. We show how an efficient Bayesian
                  approach, using a Gaussian process prior, can
                  replace the commonly used brute-force Monte Carlo
                  scheme, making it possible to apply the analysis to
                  computationally costly models. We introduce a number
                  of global, statistically justified sensitivity
                  measures for design analysis and optimization. Two
                  models of integrated systems serve us as case
                  studies to introduce the analysis and to assess its
                  convergence properties. We show that the Bayesian
                  Monte Carlo scheme can save costly simulation runs
                  and can ensure a reliable accuracy of the analysis.}
}

@inproceedings{QuiGirLarRas03,
  author =	 {Joaquin Qui{\~n}onero-Candela and Agathe Girard and
                  Jan Larsen and Carl Edward Rasmussen},
  title =	 {Propagation of Uncertainty in {B}ayesian Kernel
                  Models - Application to Multiple-Step Ahead
                  Forecasting},
  booktitle =	 {ICASSP 2003},
  journal =	 {IEEE International Conference on Acoustics, Speech
                  and Signal Processing},
  volume =	 2,
  pages =	 {701--704},
  month =	 {April},
  year =	 2003,
  location =	 {Hong Kong},
  url =		 {.},
  cat =		 {gp time},
  abstract =	 {The object of Bayesian modelling is the predictive
                  distribution, which in a forecasting scenario
                  enables improved estimates of forecasted values and
                  their uncertainties. In this paper we focus on
                  reliably estimating the predictive mean and variance
                  of forecasted values using Bayesian kernel based
                  models such as the Gaussian Process and the
                  Relevance Vector Machine. We derive novel analytic
                  expressions for the predictive mean and variance for
                  Gaussian kernel shapes under the assumption of a
                  Gaussian input distribution in the static case, and
                  of a recursive Gaussian predictive density in
                  iterative forecasting. The capability of the method
                  is demonstrated for forecasting of time-series and
                  compared to approximate methods.}
}

@inproceedings{QuiGirLarRas03b,
  author =	 {Joaquin Qui{\~n}onero-Candela and Agathe Girard and
                  Jan Larsen and Carl Edward Rasmussen},
  title =	 {Propagation of Uncertainty in {B}ayesian Kernel
                  Models - Application to Multiple-Step Ahead
                  Forecasting},
  booktitle =	 {NNSP 2003},
  journal =	 {Proceedings of 2003 IEEE International Workshop on
                  Neural Networks for Signal Processing},
  editor =	 {C.~Molina and T.~Adali and J.~Larsen and M.~Van
                  Hulle and S.~C.~Douglas and J.~Rouat},
  publisher =	 {IEEE Press},
  address =	 {Piscataway, New Jersey},
  location =	 {Toulouse},
  year =	 2003,
  url =		 {#QuiGirLarRas03},
  annote =	 {Electronic version of <a
                  href="#QuiGirLarRas03">Qui{\~n}onero-Candela,
                  Girard, Larsen and Rasmussen, 2003</a> which should
                  have been presented at ICASSP 03, but was cancelled
                  due to bird flu epidemic.},
  abstract =	 {The object of Bayesian modelling is the predictive
                  distribution, which in a forecasting scenario
                  enables improved estimates of forecasted values and
                  their uncertainties. In this paper we focus on
                  reliably estimating the predictive mean and variance
                  of forecasted values using Bayesian kernel based
                  models such as the Gaussian Process and the
                  Relevance Vector Machine. We derive novel analytic
                  expressions for the predictive mean and variance for
                  Gaussian kernel shapes under the assumption of a
                  Gaussian input distribution in the static case, and
                  of a recursive Gaussian predictive density in
                  iterative forecasting. The capability of the method
                  is demonstrated for forecasting of time-series and
                  compared to approximate methods.}
}

@techreport{QuiGirRas03,
  cat =		 {gp time},
  title =	 {Prediction at an uncertain input for {G}aussian
                  processes and {Relevance Vector Machines} -
                  Application to multiple-step ahead time-series
                  prediction},
  author =	 {Joaquin Qui{\~n}onero-Candela and Agathe Girard and
                  Carl Edward Rasmussen},
  institution =	 {Institute for Mathematical Modelling, DTU},
  number =	 {IMM-2003-18},
  year =	 2003,
  url =		 {.},
  annote =	 {<a
                  href="http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=2800">techreport</a>}
}

@article{QuiRas05,
  author =	 {Joaquin Qui{\~n}onero-Candela and Carl Edward
                  Rasmussen},
  title =	 {A Unifying View of Sparse Approximate {G}aussian
                  Process Regression},
  journal =	 jmlr,
  volume =	 6,
  pages =	 {1939--1959},
  year =	 2005,
  url =
                  {http://jmlr.csail.mit.edu/papers/volume6/quinonero-candela05a/quinonero-candela05a.pdf},
  cat =		 {gp approx},
  abstract =	 {We provide a new unifying view, including all
                  existing proper probabilistic sparse approximations
                  for Gaussian process regression. Our approach relies
                  on expressing the effective prior which the methods
                  are using. This allows new insights to be gained,
                  and highlights the relationship between existing
                  methods. It also allows for a clear theoretically
                  justified ranking of the closeness of the known
                  approximations to the corresponding full
                  GPs. Finally we point directly to designs of new
                  better sparse approximations, combining the best of
                  the existing strategies, within attractive
                  computational constraints.}
}

@incollection{QuiRas05b,
  cat =		 {gp},
  author =	 {Joaquin Qui{\~n}onero-Candela and Carl Edward
                  Rasmussen},
  booktitle =	 {Switching and Learning in Feedback Systems},
  title =	 {Analysis of Some Methods for Reduced Rank {G}aussian
                  Process Regression},
  year =	 2005,
  publisher =	 {Springer},
  pages =	 {98--127},
  editor =	 {R.~Murray-Smith and R.~Shorten},
  address =	 {Berlin, Heidelberg},
  abstract =	 {While there is strong motivation for using Gaussian
                  Processes (GPs) due to their excellent performance
                  in regression and classification problems, their
                  computational complexity makes them impractical when
                  the size of the training set exceeds a few thousand
                  cases. This has motivated the recent proliferation
                  of a number of cost-effective approximations to GPs,
                  both for classification and for regression.  In this
                  paper we analyze one popular approximation to GPs
                  for regression: the reduced rank
                  approximation. While generally GPs are equivalent to
                  infinite linear models, we show that Reduced Rank
                  Gaussian Processes (RRGPs) are equivalent to finite
                  sparse linear models. We also introduce the concept
                  of degenerate GPs and show that they correspond to
                  inappropriate priors. We show how to modify the RRGP
                  to prevent it from being degenerate at test
                  time. Training RRGPs consists both in learning the
                  covariance function hyperparameters and the support
                  set. We propose a method for learning
                  hyperparameters for a given support set. We also
                  review the Sparse Greedy GP (SGGP) approximation
                  (Smola and Bartlett, 2001), which is a way of
                  learning the support set for given hyperparameters
                  based on approximating the posterior. We propose an
                  alternative method to the SGGP that has better
                  generalization capabilities. Finally we make
                  experiments to compare the different ways of
                  training a RRGP. We provide some Matlab code for
                  learning RRGPs.},
  url =		 {.}
}

@inproceedings{QuiRasSinetal06,
  author =	 {Joaquin Qui{\~n}onero-Candela and Carl Edward
                  Rasmussen and Fabian Sinz and Olivier Bousquet and
                  Bernhard Sch{\"o}lkopf},
  title =	 {Evaluating Predictive Uncertainty Challenge},
  year =	 2006,
  publisher =	 {Springer},
  pages =	 {1--27},
  month =	 {April},
  volume =	 3944,
  series =	 lncs,
  editor =	 {J.~Qui{\~n}onero-Candela and I.~Dagan and B.~Magnini
                  and F.~d'Alch{\'e}{-}Buc},
  address =	 {Berlin, Germany},
  abstract =	 {This Chapter presents the PASCAL Evaluating
                  Predictive Uncertainty Challenge, introduces the
                  contributed Chapters by the participants who
                  obtained outstanding results, and provides a
                  discussion with some lessons to be learnt. The
                  Challenge was set up to evaluate the ability of
                  Machine Learning algorithms to provide good
                  "probabilistic predictions", rather than just the
                  usual "point predictions" with no measure of
                  uncertainty, in regression and classification
                  problems. Participants had to compete on a number of
                  regression and classification tasks, and were
                  evaluated by both traditional losses that only take
                  into account point predictions and losses we
                  proposed that evaluate the quality of the
                  probabilistic predictions.},
  booktitle =	 {Machine Learning Challenges. Evaluating predictive
                  uncertainty, visual object classification and
                  recognising textual entailment. First PASCAL Machine
                  Learning Challenges Workshop},
  location =	 {Southampton, United Kingdom},
  url =		 {.},
  doi =		 {10.1007/11736790_1}
}

@incollection{QuiRasWil07,
  cat =		 {gp},
  booktitle =	 {Large-Scale Kernel Machines},
  author =	 {Joaquin Qui{\~n}onero-Candela and Carl Edward
                  Rasmussen and Christopher K.~I.~Williams},
  title =	 {Approximation Methods for {G}aussian Process
                  Regression},
  year =	 2007,
  series =	 {Neural Information Processing},
  publisher =	 mit,
  pages =	 {203--223},
  month =	 {September},
  editor =	 {L.~Bottou and O.~Chapelle and D.~DeCoste and
                  J.~Weston},
  address =	 {Cambridge, MA, USA},
  abstract =	 {A wealth of computationally efficient approximation
                  methods for Gaussian process regression have been
                  recently proposed. We give a unifying overview of
                  sparse approximations, following <a
                  href="#QuiRas05">Qui{\~n}onero-Candela and Rasmussen
                  (2005)</a>, and a brief review of approximate
                  matrix-vector multiplication methods.},
  url =		 {.},
  annote =	 {<a
                  href="http://mitpress.mit.edu/9780262026253">book</a>}
}

@inproceedings{Ras00,
  cat =		 {np},
  author =	 {Carl Edward Rasmussen},
  title =	 {The Infinite {G}aussian Mixture Model},
  booktitle =	 nips12,
  year =	 2000,
  pages =	 {554--560},
  url =		 {.},
  editor =	 {Sara A.~Solla and Todd K.~Leen and Klaus-Robert
                  M{\"u}ller},
  publisher =	 mit,
  abstract =	 {In a Bayesian mixture model it is not necessary a
                  priori to limit the number of components to be
                  finite. In this paper an infinite Gaussian mixture
                  model is presented which neatly sidesteps the
                  difficult problem of finding the "right" number of
                  mixture components. Inference in the model is done
                  using an efficient parameter-free Markov Chain that
                  relies entirely on Gibbs sampling.}
}

@incollection{Ras03,
  cat =		 {gp mcmc},
  author =	 {Carl Edward Rasmussen},
  title =	 {{G}aussian Processes to Speed up {H}ybrid {M}onte
                  {C}arlo for Expensive {B}ayesian Integrals},
  booktitle =	 {Bayesian Statistics 7},
  pages =	 {651--659},
  url =		 {.},
  publisher =	 oup,
  year =	 2003,
  editor =	 {J.~M.~Bernardo and M.~J.~Bayarri and J.~O.~Berger
                  and A.~P.~Dawid and D.~Heckerman and A.~F.~M.~Smith
                  and M.~West},
  abstract =	 {Hybrid Monte Carlo (HMC) is often the method of
                  choice for computing Bayesian integrals that are not
                  analytically tractable. However the success of this
                  method may require a very large number of
                  evaluations of the (un-normalized) posterior and its
                  partial derivatives. In situations where the
                  posterior is computationally costly to evaluate,
                  this may lead to an unacceptable computational load
                  for HMC.  I propose to use a Gaussian Process model
                  of the (log of the) posterior for most of the
                  computations required by HMC. Within this scheme
                  only occasional evaluation of the actual posterior
                  is required to guarantee that the samples generated
                  have exactly the desired distribution, even if the
                  GP model is somewhat inaccurate. The method is
                  demonstrated on a 10 dimensional problem, where 200
                  evaluations suffice for the generation of 100
                  roughly independent points from the posterior. Thus,
                  the proposed scheme allows Bayesian treatment of
                  models with posteriors that are computationally
                  demanding, such as models involving computer
                  simulation.}
}

@incollection{Ras04,
  cat =		 {gp review},
  booktitle =	 {Advanced Lectures on Machine Learning: ML Summer
                  Schools 2003, Canberra, Australia, February 2 - 14,
                  2003, T{\"u}bingen, Germany, August 4 - 16, 2003,
                  Revised Lectures},
  author =	 {Carl Edward Rasmussen},
  title =	 {{G}aussian Processes in Machine Learning},
  year =	 2004,
  volume =	 3176,
  series =	 lncs,
  publisher =	 {Springer-Verlag},
  pages =	 {63--71},
  annote =	 {Copyright by Springer, <a
                  href="http://www.springerlink.com/content/lrh41y849xdh">springerlink</a>},
  editor =	 {Olivier Bousquet and Ulrike von Luxburg and Gunnar
                  R{\"a}tsch},
  address =	 {Heidelberg},
  url =		 {.},
  abstract =	 {We give a basic introduction to Gaussian Process
                  regression models. We focus on understanding the
                  role of the stochastic process and how it is used to
                  define a distribution over functions. We present the
                  simple equations for incorporating training data and
                  examine how to learn the hyperparameters using the
                  marginal likelihood. We explain the practical
                  advantages of Gaussian Process and end with
                  conclusions and a look at the current trends in GP
                  work.}
}

@inproceedings{Ras96,
  cat =		 {mcmc},
  author =	 {Carl Edward Rasmussen},
  title =	 {A practical {M}onte {C}arlo implementation of
                  {B}ayesian learning},
  booktitle =	 nips8,
  editor =	 {D.~S.~Touretzky and M.~C.~Mozer and M.~E.~Hasselmo},
  pages =	 {598--604},
  url =		 {.},
  publisher =	 mit,
  address =	 {Cambridge, MA, USA},
  year =	 1996,
  abstract =	 {A practical method for Bayesian training of
                  feed-forward neural networks using sophisticated
                  Monte Carlo methods is presented and evaluated. In
                  reasonably small amounts of computer time this
                  approach outperforms other state-of-the-art methods
                  on 5 data-limited tasks from real world domains.}
}

@phdthesis{Ras96b,
  author =	 {Carl Edward Rasmussen},
  title =	 {Evaluation of {G}aussian Processes and other Methods
                  for non-linear Regression},
  school =	 {University of Toronto, Department of Computer
                  Science},
  address =	 {Toronto, CANADA},
  year =	 1996,
  url =		 {.},
  cat =		 {gp},
  abstract =	 {This thesis develops two Bayesian learning methods
                  relying on Gaussian processes and a rigorous
                  statistical approach for evaluating such methods.
                  In these experimental designs the sources of
                  uncertainty in the estimated generalisation
                  performances due to both variation in training and
                  test sets are accounted for. The framework allows
                  for estimation of generalisation performance as well
                  as statistical tests of significance for pairwise
                  comparisons. Two experimental designs are
                  recommended and supported by the DELVE software
                  environment.<br> Two new non-parametric Bayesian
                  learning methods relying on Gaussian process priors
                  over functions are developed.  These priors are
                  controlled by hyperparameters which set the
                  characteristic length scale for each input
                  dimension. In the simplest method, these parameters
                  are fit from the data using optimization.  In the
                  second, fully Bayesian method, a Markov chain Monte
                  Carlo technique is used to integrate over the
                  hyperparameters. One advantage of these Gaussian
                  process methods is that the priors and
                  hyperparameters of the trained models are easy to
                  interpret.<br> The Gaussian process methods are
                  benchmarked against several other methods, on
                  regression tasks using both real data and data
                  generated from realistic simulations. The
                  experiments show that small datasets are unsuitable
                  for benchmarking purposes because the uncertainties
                  in performance measurements are large. A second set
                  of experiments provide strong evidence that the
                  bagging procedure is advantageous for the
                  Multivariate Adaptive Regression Splines (MARS)
                  method.<br> The simulated datasets have controlled
                  characteristics which make them useful for
                  understanding the relationship between properties of
                  the dataset and the performance of different
                  methods. The dependency of the performance on
                  available computation time is also investigated. It
                  is shown that a Bayesian approach to learning in
                  multi-layer perceptron neural networks achieves
                  better performance than the commonly used early
                  stopping procedure, even for reasonably short
                  amounts of computation time. The Gaussian process
                  methods are shown to consistently outperform the
                  more conventional methods.}
}

@proceedings{RasBueGieSch04,
  editor =	 {Carl Edward Rasmussen and Heinrich H.~B{\"u}lthoff
                  and Martin A.~Giese and Bernhard Sch{\"o}lkopf},
  title =	 {Pattern Recognition: 26th {DAGM} Symposium},
  series =	 lncs,
  volume =	 3175,
  publisher =	 {Springer},
  address =	 {Berlin, Germany},
  location =	 {T{\"u}bingen, Germany},
  month =	 {August},
  year =	 2004,
  pages =	 581,
  doi =		 {10.1007/b99676},
  url =		 {http://dx.doi.org/10.1007/b99676}
}

@article{RasCruGhaWil09,
  author =	 {Carl Edward Rasmussen and Bernhard J.~de la Cruz and
                  Zoubin Ghahramani and David L.~Wild},
  title =	 {Modeling and Visualizing Uncertainty in Gene
                  Expression Clusters Using {D}irichlet Process
                  Mixtures},
  journal =	 tcbb,
  volume =	 6,
  number =	 4,
  pages =	 {615--628},
  year =	 2009,
  issn =	 {1545-5963},
  doi =		 {10.1109/TCBB.2007.70269},
  url =		 {http://dx.doi.org/10.1109/TCBB.2007.70269},
  cat =		 {clust bioinf},
  abstract =	 {Although the use of clustering methods has rapidly
                  become one of the standard computational approaches
                  in the literature of microarray gene expression
                  data, little attention has been paid to uncertainty
                  in the results obtained. Dirichlet process mixture
                  (DPM) models provide a nonparametric Bayesian
                  alternative to the bootstrap approach to modeling
                  uncertainty in gene expression clustering. Most
                  previously published applications of Bayesian
                  model-based clustering methods have been to short
                  time series data. In this paper, we present a case
                  study of the application of nonparametric Bayesian
                  clustering methods to the clustering of
                  high-dimensional nontime series gene expression data
                  using full Gaussian covariances. We use the
                  probability that two genes belong to the same
                  cluster in a DPM model as a measure of the
                  similarity of these gene expression
                  profiles. Conversely, this probability can be used
                  to define a dissimilarity measure, which, for the
                  purposes of visualization, can be input to one of
                  the standard linkage algorithms used for
                  hierarchical clustering. Biologically plausible
                  results are obtained from the Rosetta compendium of
                  expression profiles which extend previously
                  published cluster analyses of this data.}
}

@inproceedings{RasDei08,
  cat =		 {rl},
  title =	 {Probabilistic Inference for Fast Learning in
                  Control},
  pages =	 {229--242},
  booktitle =	 {Recent Advances in Reinforcement Learning},
  publisher =	 {Springer-Verlag},
  year =	 2008,
  editor =	 {S. Girgin and M. Loth and R. Munos and P. Preux and
                  D. Ryabko},
  author =	 {Carl Edward Rasmussen and Marc Peter Deisenroth},
  volume =	 5323,
  series =	 lncs,
  type =	 {Lecture Notes in Artificial Intelligence},
  month =	 {November},
  address =	 {Villeneuve d'Ascq, France},
  url =		 {.},
  abstract =	 {We provide a novel framework for very fast
                  model-based reinforcement learning in continuous
                  state and action spaces. The framework requires
                  probabilistic models that explicitly characterize
                  their levels of confidence. Within this framework,
                  we use flexible, non-parametric models to describe
                  the world based on previously collected experience.
                  We demonstrate learning on the cart-pole problem in
                  a setting where we provide very limited prior
                  knowledge about the task. Learning progresses
                  rapidly, and a good policy is found after only a
                  handful of iterations.},
  annote =	 {<a
                  href="http://mlg.eng.cam.ac.uk/carl/ewrl08/">videos
                  and more</a>. <a
                  href="http://mlg.eng.cam.ac.uk/marc/talks/2008-07-03-EWRL-lille.pdf">slides</a>.}
}

@inproceedings{RasGha01,
  cat =		 {review},
  author =	 {Carl Edward Rasmussen and Zoubin Ghahramani},
  title =	 {{O}ccam's Razor},
  booktitle =	 nips13,
  address =	 {Cambridge, MA, USA},
  publisher =	 mit,
  editor =	 {T.~K.~Leen and T.~G.~Dietterich and V.~Tresp},
  pages =	 {294--300},
  year =	 2001,
  month =	 {December},
  url =		 {.},
  abstract =	 {The Bayesian paradigm apparently only sometimes
                  gives rise to Occam's Razor; at other times very
                  large models perform well. We give simple examples
                  of both kinds of behaviour. The two views are
                  reconciled when measuring complexity of functions,
                  rather than of the machinery used to implement
                  them. We analyze the complexity of functions for
                  some linear in the parameter models that are
                  equivalent to Gaussian Processes, and always find
                  Occam's Razor at work.}
}

@inproceedings{RasGha02,
  cat =		 {gp np},
  author =	 {Carl Edward Rasmussen and Zoubin Ghahramani},
  title =	 {Infinite Mixtures of {G}aussian Process Experts},
  booktitle =	 nips14,
  year =	 2002,
  month =	 {December},
  editor =	 {T. G. Dietterich and S. Becker and Z. Ghahramani},
  pages =	 {881--888},
  address =	 {Cambridge, MA, USA},
  publisher =	 mit,
  url =		 {.},
  abstract =	 {We present an extension to the Mixture of Experts
                  (ME) model, where the individual experts are
                  Gaussian Process (GP) regression models. Using an
                  input-dependent adaptation of the Dirichlet Process,
                  we implement a gating network for an infinite number
                  of Experts. Inference in this model may be done
                  efficiently using a Markov Chain relying on Gibbs
                  sampling. The model allows the effective covariance
                  function to vary with the inputs, and may handle
                  large datasets --- thus potentially overcoming two
                  of the biggest hurdles with GP models. Simulations
                  show the viability of this approach.}
}

@inproceedings{RasGha03,
  cat          = {gp mcmc},
  author       = {Carl Edward Rasmussen and Zoubin Ghahramani},
  title        = {Bayesian {Monte Carlo}},
  booktitle    = nips15,
  editor       = {S.~Becker and S.~Thrun and K.~Obermayer},
  publisher    = mit,
  address      = {Cambridge, MA, USA},
  pages        = {489--496},
  month        = {December},
  year         = 2003,
  url          = {.},
  abstract     = {We investigate Bayesian alternatives to classical
                  Monte Carlo methods for evaluating
                  integrals. Bayesian Monte Carlo (BMC) allows the
                  incorporation of prior knowledge, such as smoothness
                  of the integrand, into the estimation. In a simple
                  problem we show that this outperforms any classical
                  importance sampling method. We also attempt more
                  challenging multidimensional integrals involved in
                  computing marginal likelihoods of statistical models
                  (a.k.a. partition functions and model evidences). We
                  find that Bayesian Monte Carlo outperformed Annealed
                  Importance Sampling, although for very high
                  dimensional problems or problems with massive
                  multimodality BMC may be less adequate. One
                  advantage of the Bayesian approach to Monte Carlo is
                  that samples can be drawn from any
                  distribution. This allows for the possibility of
                  active design of sample points so as to maximise
                  information gain.}
}

@inproceedings{RasKus04,
  cat =		 {rl},
  author =	 {Carl Edward Rasmussen and Malte Ku{\ss}},
  booktitle =	 nips16,
  editor =	 {S. Thrun and L. K. Saul and B. Sch{\"o}lkopf},
  title =	 {Gaussian processes in reinforcement learning},
  address =	 {Cambridge, MA, USA},
  publisher =	 mit,
  year =	 2004,
  month =	 {December},
  pages =	 {751--759},
  url =		 {.},
  abstract =	 {We exploit some useful properties of Gaussian
                  process (GP) regression models for reinforcement
                  learning in continuous state spaces and discrete
                  time. We demonstrate how the GP model allows
                  evaluation of the value function in closed form. The
                  resulting policy iteration algorithm is demonstrated
                  on a simple problem with a two dimensional state
                  space. Further, we speculate that the intrinsic
                  ability of GP models to characterise distributions
                  of functions would allow the method to capture
                  entire distributions over future values instead of
                  merely their expectation, which has traditionally
                  been the focus of much of reinforcement learning.}
}

@manual{RasNeaHinetal96,
  author =	 {Carl Edward Rasmussen and Radford M.~Neal and
                  Geoffrey E.~Hinton and Drew van Camp and Mike Revow
                  and Zoubin Ghahramani and Rafal Kustra and Robert
                  Tibshirani},
  title =	 {The {DELVE} manual},
  pages =	 {1--108},
  year =	 1996,
  abstract =	 {DELVE -- Data for Evaluating Learning in Valid
                  Experiments -- is a collection of datasets from many
                  sources, an environment within which this data can
                  be used to assess the performance of methods for
                  learning relationships from data, and a repository
                  for the results of such experiments.},
  url =		 {.},
  annote =	 {The <a href="http://www.cs.toronto.edu/~delve">delve
                  website</a>.},
  organization = {University of Toronto}
}

@article{RasNic10,
  cat =		 {gp},
  author =	 {Carl Edward Rasmussen and Hannes Nickisch},
  title =	 {{Gaussian Processes for Machine Learning (GPML)
                  Toolbox}},
  journal =	 jmlr,
  volume =	 11,
  pages =	 {3011--3015},
  year =	 2010,
  month =	 {December},
  url =
                  {http://www.jmlr.org/papers/volume11/rasmussen10a/rasmussen10a.pdf},
  abstract =	 {The GPML toolbox provides a wide range of
                  functionality for Gaussian process (GP) inference
                  and prediction. GPs are specified by mean and
                  covariance functions; we offer a library of simple
                  mean and covariance functions and mechanisms to
                  compose more complex ones. Several likelihood
                  functions are supported including Gaussian and
                  heavy-tailed for regression as well as others
                  suitable for classification. Finally, a range of
                  inference methods is provided, including exact and
                  variational inference, Expectation Propagation, and
                  Laplace's method dealing with non-Gaussian
                  likelihoods and FITC for dealing with large
                  regression tasks.},
  annote =	 {Toolbox available from <a
                  href="http://GaussianProcess.org/gpml/code">here</a>. Implements
                  algorithms from <a href="#RasWil06">Rasmussen and
                  Williams, 2006</a>.}
}

@inproceedings{RasQui05,
  cat =		 {gp},
  author =	 {Carl Edward Rasmussen and Joaquin
                  Qui{\~n}onero-Candela},
  title =	 {Healing the {R}elevance {V}ector {M}achine through
                  Augmentation},
  year =	 2005,
  pages =	 {689--696},
  booktitle =	 icml22,
  editor =	 {L.~De Raedt and S.~Wrobel},
  abstract =	 {The Relevance Vector Machine (RVM) is a sparse
                  approximate Bayesian kernel method. It provides full
                  predictive distributions for test cases. However,
                  the predictive uncertainties have the unintuitive
                  property, that \emph{they get smaller the further
                  you move away from the training cases}. We give a
                  thorough analysis. Inspired by the analogy to
                  non-degenerate Gaussian Processes, we suggest
                  augmentation to solve the problem. The purpose of
                  the resulting model, RVM*, is primarily to
                  corroborate the theoretical and experimental
                  analysis. Although RVM* could be used in practical
                  applications, it is no longer a truly sparse
                  model. Experiments show that sparsity comes at the
                  expense of worse predictive distributions.},
  address =	 {Bonn, Germany},
  url =		 {.}
}

@book{RasWil06,
  cat =		 {gp},
  author =	 {Carl Edward Rasmussen and Christopher
                  K.~I.~Williams},
  title =	 {Gaussian Processes for Machine Learning},
  publisher =	 mit,
  pages =	 272,
  year =	 2006,
  url =		 {.},
  abstract =	 {Gaussian processes (GPs) provide a principled,
                  practical, probabilistic approach to learning in
                  kernel machines. GPs have received increased
                  attention in the machine-learning community over the
                  past decade, and this book provides a long-needed
                  systematic and unified treatment of theoretical and
                  practical aspects of GPs in machine learning. The
                  treatment is comprehensive and self-contained,
                  targeted at researchers and students in machine
                  learning and applied statistics.},
  annote =	 {Winner of the 2009 <a
                  href="http://bayesian.org/awards/DeGrootPrize.html">DeGroot
                  Prize</a>. Book <a
                  href="http://www.gaussianprocess.org/gpml">web
                  page</a>, <a
                  href="http://www.gaussianprocess.org/gpml/chapters">chapters</a>
                  and <a
                  href="http://www.gaussianprocess.org/gpml/chapters/RW.pdf">entire
                  book pdf</a>. <a href="#RasNic10">GPML Toolbox</a>.}
}

@article{RasWil93,
  author =	 {Carl Edward Rasmussen and David J. Willshaw},
  title =	 {Presynaptic and postsynaptic competition in models
                  for the development of neuromuscular connections},
  journal =	 {Biological Cybernetics},
  publisher =	 {Springer},
  volume =	 68,
  number =	 5,
  pages =	 {409--419},
  year =	 1993,
  url =		 {http://dx.doi.org/10.1007/BF00198773},
  abstract =	 {In the establishment of connections between nerve
                  and muscle there is an initial stage when each
                  muscle fibre is innervated by several different
                  motor axons. Withdrawal of connections then takes
                  place until each fibre has contact from just a
                  single axon. The evidence suggests that the
                  withdrawal process involves competition between
                  nerve terminals. We examine in formal models several
                  types of competitive mechanism that have been
                  proposed for this phenomenon. We show that a model
                  which combines competition for a presynaptic
                  resource with competition for a postsynaptic
                  resource is superior to others. This model accounts
                  for many anatomical and physiological findings and
                  has a biologically plausible
                  implementation. Intrinsic withdrawal appears to be a
                  side effect of the competitive mechanism rather than
                  a separate non-competitive feature. The model's
                  capabilities are confirmed by theoretical analysis
                  and full scale computer simulations.},
  doi =		 {10.1007/BF00198773}
}

@inproceedings{RotVanMooGha10,
  cat =		 {ssl},
  author =	 {Rotsos, C. and {Van Gael}, J. and Moore, A. W. and
                  Ghahramani, Z.},
  booktitle =	 {1st International Workshop on Traffic Analysis and
                  Classification (IWCMC '10)},
  keywords =	 {Machine Learning,Network,Semi Supervised},
  mendeley-tags ={Machine Learning,Network,Semi Supervised},
  title =	 {Traffic Classification in Information Poor
                  Environments},
  url =		 {.},
  year =	 2010,
  month =	 {July},
  address =	 {Caen, France},
  abstract =	 {Traffic classification using machine learning
                  continues to be an active research area. The
                  majority of work in this area uses
                  \emph{off-the-shelf} machine learning tools and
                  treats them as \emph{black-box} classifiers. This
                  approach turns all the modelling complexity into a
                  feature selection problem. In this paper, we build a
                  problem-specific solution to the traffic
                  classification problem by designing a custom
                  probabilistic graphical model. Graphical models are
                  a modular framework to design classifiers which
                  incorporate domain-specific knowledge.  More
                  specifically, our solution introduces
                  semi-supervised learning which means we learn from
                  both labelled and unlabelled traffic flows. We show
                  that our solution performs competitively compared to
                  previous approaches while using less data and
                  simpler features.}
}

@inproceedings{RotVanMooetal10,
  cat          = {gm},
  author       = {Charalampos Rotsos and Jurgen {Van Gael} and Andrew
                  W. Moore and Zoubin Ghahramani},
  title        = {Probabilistic Graphical Models for Semi-Supervised
                  Traffic Classification},
  booktitle    = {The 6th International Wireless Communications and
                  Mobile Computing Conference},
  address      = {Caen, France},
  pages        = {752--757},
  year         = 2010,
  url          = {.},
  abstract     = {Traffic classification using machine learning
                  continues to be an active research area. The
                  majority of work in this area uses off-the-shelf
                  machine learning tools and treats them as black-box
                  classifiers. This approach turns all the modelling
                  complexity into a feature selection problem. In this
                  paper, we build a problem-specific solution to the
                  traffic classification problem by designing a custom
                  probabilistic graphical model. Graphical models are
                  a modular framework to design classifiers which
                  incorporate domain-specific knowledge. More
                  specifically, our solution introduces
                  semi-supervised learning which means we learn from
                  both labelled and unlabelled traffic flows. We show
                  that our solution performs competitively compared to
                  previous approaches while using less data and
                  simpler features.}
}

@inproceedings{SaaTurRas10,
  cat          = {gp time},
  author       = {Yunus {Saat\c{c}i} and Ryan Turner and Carl Edward
                  Rasmussen},
  title        = {Gaussian Process Change Point Models},
  booktitle    = icml27,
  address      = {Haifa, Israel},
  pages        = {927--934},
  month        = {June},
  year         = 2010,
  url          = {.},
  abstract     = {We combine Bayesian online change point detection
                  with Gaussian processes to create a nonparametric
                  time series model which can handle change
                  points. The model can be used to locate change
                  points in an online manner; and, unlike other
                  Bayesian online change point detection algorithms,
                  is applicable when temporal correlations in a regime
                  are expected. We show three variations on how to
                  apply Gaussian processes in the change point
                  context, each with their own advantages. We present
                  methods to reduce the computational burden of these
                  models and demonstrate it on several real world data
                  sets.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/rdturner/ICML2010poster.pdf">poster</a>,
                  <a
                  href="http://mlg.eng.cam.ac.uk/rdturner/ICML2010talk.pdf">slides</a>.}
}

@article{SavGhaGrietal10,
  cat          = {bioinf},
  author       = {R. S. Savage and Z. Ghahramani and J. E. Griffin and
                  B. de la Cruz and D. L. Wild},
  title        = {Discovering Transcriptional Modules by {Bayesian}
                  Data Integration},
  journal      = {Bioinformatics},
  volume       = 26,
  pages        = {i158--i167},
  year         = 2010,
  url          = {.},
  abstract     = {Motivation: We present a method for directly
                  inferring transcriptional modules (TMs) by
                  integrating gene expression and transcription factor
                  binding (ChIP-chip) data. Our model extends a
                  hierarchical Dirichlet process mixture model to
                  allow data fusion on a gene-by-gene basis. This
                  encodes the intuition that co-expression and
                  co-regulation are not necessarily equivalent and
                  hence we do not expect all genes to group similarly
                  in both datasets. In particular, it allows us to
                  identify the subset of genes that share the same
                  structure of transcriptional modules in both
                  datasets.<br/>Results: We find that by working on a
                  gene-by-gene basis, our model is able to extract
                  clusters with greater functional coherence than
                  existing methods. By combining gene expression and
                  transcription factor binding (ChIP-chip) data in
                  this way, we are better able to determine the groups
                  of genes that are most likely to represent
                  underlying TMs.<br/>Availability: If interested in
                  the code for the work presented in this article,
                  please contact the authors.}
}

@article{SavHelXuetal09,
  cat =		 {clust bioinf},
  volume =	 10,
  number =	 242,
  pages =	 {1--9},
  month =	 {August},
  author =	 {R.~Savage and K.~A.~Heller and Y.~Xu and Zoubin
                  Ghahramani and W.~Truman and M.~Grant and K.~Denby
                  and D.~L.~Wild},
  title =	 {{R/BHC}: fast {B}ayesian hierarchical clustering for
                  microarray data},
  publisher =	 {BioMed Central},
  journal =	 {BMC Bioinformatics},
  year =	 2009,
  url =		 {.},
  abstract =	 {Background: Although the use of clustering methods
                  has rapidly become one of the standard computational
                  approaches in the literature of microarray gene
                  expression data analysis, little attention has been
                  paid to uncertainty in the results obtained.<br/>
                  Results: We present an R/Bioconductor port of a fast
                  novel algorithm for Bayesian agglomerative
                  hierarchical clustering and demonstrate its use in
                  clustering gene expression microarray data. The
                  method performs bottom-up hierarchical clustering,
                  using a Dirichlet Process (infinite mixture) to
                  model uncertainty in the data and Bayesian model
                  selection to decide at each step which clusters to
                  merge.<br/> Conclusion: Biologically plausible
                  results are presented from a well studied data set:
                  expression profiles of \emph{A. thaliana} subjected
                  to a variety of biotic and abiotic stresses. Our
                  method avoids several limitations of traditional
                  methods, for example how many clusters there should
                  be and how to choose a principled distance metric.},
  doi =		 {10.1186/1471-2105-10-242},
  issn =	 {1471-2105},
  pubmedid =	 19660130
}

@inproceedings{Sch09,
  author       = {Mikkel N. Schmidt},
  title        = {Function factorization using warped {G}aussian
                  processes},
  booktitle    = icml26,
  editor       = {L\'{e}on Bottou and Michael Littman},
  publisher    = {Omnipress},
  address      = {Montr\'{e}al, QC, Canada},
  pages        = {921--928},
  month        = {June},
  year         = 2009,
  url          = {.},
  abstract     = {We introduce a new approach to non-linear regression
                  called function factorization, that is suitable for
                  problems where an output variable can reasonably be
                  modeled by a number of multiplicative interaction
                  terms between non-linear functions of the
                  inputs. The idea is to approximate a complicated
                  function on a high-dimensional space by the sum of
                  products of simpler functions on lower-dimensional
                  subspaces. Function factorization can be seen as a
                  generalization of matrix and tensor factorization
                  methods, in which the data are approximated by the
                  sum of outer products of vectors. We present a
                  non-parametric Bayesian approach to function
                  factorization where the priors over the factorizing
                  functions are warped Gaussian processes, and we do
                  inference using Hamiltonian Markov chain Monte
                  Carlo. We demonstrate the superior predictive
                  performance of the method on a food science data set
                  compared to Gaussian process regression and tensor
                  factorization using PARAFAC and GEMANOVA models.},
  annote       = {<a
                  href="http://mikkelschmidt.dk/uploads/media/talk_icml2009_01.pdf">slides</a>. <a
                  href="http://mikkelschmidt.dk/uploads/media/poster_icml2009_01.pdf">poster</a>. <a
                  href="http://videolectures.net/icml09_schmidt_ffuw/">video</a>.}
}

@inproceedings{Sch09b,
  author       = {Mikkel N. Schmidt},
  title        = {Linearly constrained {B}ayesian matrix factorization
                  for blind source separation},
  booktitle    = nips22,
  editor       = {Y. Bengio and D. Schuurmans and J. Lafferty and
                  C. K. I. Williams and A. Culotta},
  publisher    = mit,
  address      = {Cambridge, MA, USA},
  pages        = {1624--1632},
  month        = {December},
  year         = 2009,
  url          = {.},
  abstract     = {We present a general Bayesian approach to
                  probabilistic matrix factorization subject to linear
                  constraints. The approach is based on a Gaussian
                  observation model and Gaussian priors with bilinear
                  equality and inequality constraints. We present an
                  efficient Markov chain Monte Carlo inference
                  procedure based on Gibbs sampling. Special cases of
                  the proposed model are Bayesian formulations of
                  non-negative matrix factorization and factor
                  analysis. The method is evaluated on a blind source
                  separation problem. We demonstrate that our
                  algorithm can be used to extract meaningful and
                  interpretable features that are remarkably different
                  from features extracted using existing related
                  matrix factorization techniques.},
  annote       = {<a
                  href="http://mikkelschmidt.dk/index.php?id=23&cHash=9da63395ee586d4767b1fd35337bd5dd&tx_ttnews[tt_news]=47&tx_ttnews[backPid]=2">code</a>.}
}

@inproceedings{SchMoh09,
  author       = {Mikkel N. Schmidt and Shakir Mohamed},
  title        = {Probabilistic non-negative tensor factorization
                  using {M}arkov chain {M}onte {C}arlo},
  booktitle    = {European Signal Processing Conference (EUSIPCO)},
  address      = {Glasgow, Scotland},
  pages        = {1918--1922},
  month        = {August},
  year         = 2009,
  url          = {.},
  abstract     = {We present a probabilistic model for learning
                  non-negative tensor factorizations (NTF), in which
                  the tensor factors are latent variables associated
                  with each data dimension. The non-negativity
                  constraint for the latent factors is handled by
                  choosing priors with support on the non-negative
                  numbers. Two Bayesian inference procedures based on
                  Markov chain Monte Carlo sampling are described:
                  Gibbs sampling and Hamiltonian Markov chain Monte
                  Carlo. We evaluate the model on two food science
                  data sets, and show that the probabilistic NTF model
                  leads to better predictions and avoids overfitting
                  compared to existing NTF approaches.},
  annote       = {Rated by reviewers amongst the top 5% of the
                  presented papers.}
}

@article{SchVenKnoetal11,
  cat =		 {bioinf},
  author =	 {Cornelia Schone and Anne Venner and David A. Knowles
                  and Mahesh M. Karnani and Denis Burdakov},
  title =	 {Dichotomous cellular properties of mouse
                  orexin/hypocretin neurons},
  url =
                  {http://jp.physoc.org/content/early/2011/04/11/jphysiol.2011.208637.abstract},
  journal =	 {The Journal of Physiology},
  year =	 2011,
  abstract =	 {Hypothalamic hypocretin/orexin (hcrt/orx) neurons
                  recently emerged as critical regulators of
                  sleep-wake cycles, reward-seeking, and body energy
                  balance. However, at the level of cellular and
                  network properties, it remains unclear whether
                  hcrt/orx neurons are one homogenous population, or
                  whether there are several distinct types of hcrt/orx
                  cells. Here, we collated diverse structural and
                  functional information about individual hcrt/orx
                  neurons in mouse brain slices, by combining
                  patch-clamp analysis of spike firing, membrane
                  currents, and synaptic inputs with confocal imaging
                  of cell shape and subsequent 3-dimensional Sholl
                  analysis of dendritic architecture. Statistical
                  cluster analysis of intrinsic firing properties
                  revealed that hcrt/orx neurons fall into two
                  distinct types. These two cell types also differ in
                  the complexity of their dendritic arbour, the
                  strength of AMPA and GABAA receptor-mediated
                  synaptic drive that they receive, and the density of
                  low-threshold, 4-aminopyridine-sensitive, transient
                  K+ current. Our results provide quantitative
                  evidence that, at the cellular level, the mouse
                  hcrt/orx system is composed of two classes of
                  neurons with different firing properties,
                  morphologies, and synaptic input organization.}
}

@inproceedings{SchWinHan09,
  author       = {Mikkel N. Schmidt and Ole Winther and Lars Kai
                  Hansen},
  title        = {Bayesian non-negative matrix factorization},
  booktitle    = {8th International Conference on Independent
                  Component Analysis and Signal Separation},
  series       = lncs,
  volume       = 5441,
  publisher    = {Springer},
  address      = {Paraty, Brazil},
  pages        = {540--547},
  month        = {March},
  year         = 2009,
  url          = {.},
  abstract     = {We present a Bayesian treatment of non-negative
                  matrix factorization (NMF), based on a normal
                  likelihood and exponential priors, and derive an
                  efficient Gibbs sampler to approximate the posterior
                  density of the NMF factors. On a chemical brain
                  imaging data set, we show that this improves
                  interpretability by providing uncertainty
                  estimates. We discuss how the Gibbs sampler can be
                  used for model order selection by estimating the
                  marginal likelihood, and compare with the Bayesian
                  information criterion. For computing the maximum a
                  posteriori estimate we present an iterated
                  conditional modes algorithm that rivals existing
                  state-of-the-art NMF algorithms on an image feature
                  extraction problem.},
  annote       = {<a
                  href="http://mikkelschmidt.dk/uploads/media/talk_ica2009.pdf">slides</a>. <a
                  href="http://mikkelschmidt.dk/uploads/media/bayesnmf_01.zip">code</a>.}
}

@inproceedings{SilChuGha08,
  cat          = {gm},
  author       = {R.~Silva and W.~Chu and Zoubin Ghahramani},
  title        = {Hidden common cause relations in relational
                  learning},
  booktitle    = nips20,
  editor       = {J.~C.~Platt and D.~Koller and Y.~Singer and
                  S.~Roweis},
  publisher    = mit,
  address      = {Cambridge, MA, USA},
  pages        = {1345--1352},
  month        = {December},
  year         = 2008,
  url          = {.},
  abstract     = {When predicting class labels for objects within a
                  relational database, it is often helpful to consider
                  a model for relationships: this allows for
                  information between class labels to be shared and to
                  improve prediction performance. However, there are
                  different ways by which objects can be related
                  within a relational database.  One traditional way
                  corresponds to a Markov network structure: each
                  existing relation is represented by an undirected
                  edge. This encodes that, conditioned on input
                  features, each object label is independent of other
                  object labels given its neighbors in the
                  graph. However, there is no reason why Markov
                  networks should be the only representation of choice
                  for symmetric dependence structures. Here we discuss
                  the case when relationships are postulated to exist
                  due to \emph{hidden common causes}. We discuss how
                  the resulting graphical model differs from Markov
                  networks, and how it describes different types of
                  real-world relational processes.  A Bayesian
                  nonparametric classification model is built upon
                  this graphical representation and evaluated with
                  several empirical studies.},
  annote       = {Code at <a
                  href="http://www.homepages.ucl.ac.uk/~ucgtrbd/code/xgp">http://www.homepages.ucl.ac.uk/~ucgtrbd/code/xgp</a>}
}

@article{SilGha09,
  cat          = {gm},
  author       = {R. Silva and Z. Ghahramani},
  title        = {The hidden life of latent variables: {Bayesian}
                  learning with mixed graph models},
  journal      = jmlr,
  volume       = 10,
  pages        = {1187--1238},
  month        = {June},
  year         = 2009,
  publisher    = {Association for Computing Machinery},
  url          = {.},
  abstract     = {Directed acyclic graphs (DAGs) have been widely used
                  as a representation of conditional independence in
                  machine learning and statistics. Moreover, hidden or
                  latent variables are often an important component of
                  graphical models. However, DAG models suffer from an
                  important limitation: the family of DAGs is not
                  closed under marginalization of hidden
                  variables. This means that in general we cannot use
                  a DAG to represent the independencies over a subset
                  of variables in a larger DAG. Directed mixed graphs
                  (DMGs) are a representation that includes DAGs as a
                  special case, and overcomes this limitation. This
                  paper introduces algorithms for performing Bayesian
                  inference in Gaussian and probit DMG models. An
                  important requirement for inference is the
                  specification of the distribution over parameters of
                  the models. We introduce a new distribution for
                  covariance matrices of Gaussian DMGs. We discuss and
                  illustrate how several Bayesian machine learning
                  tasks can benefit from the principle presented here:
                  the power to model dependencies that are generated
                  from hidden variables, but without necessarily
                  modeling such variables explicitly.}
}

@inproceedings{SilGha09b,
  author       = {R. Silva and Z. Ghahramani},
  title        = {Factorial mixture of {Gaussians} and the marginal
                  independence model},
  booktitle    = aistats12,
  editor       = {D. van Dyk and M. Welling},
  volume       = 5,
  pages        = {520--527},
  month        = {April},
  year         = 2009,
  publisher    = jmlr,
  address      = {Clearwater Beach, FL, USA},
  note         = {ISSN 1938-7228},
  url          = {.},
  annote       = {Code at <a
                  href="http://www.homepages.ucl.ac.uk/~ucgtrbd/code/fmog-version0.zip">http://www.homepages.ucl.ac.uk/~ucgtrbd/code/fmog-version0.zip</a>},
  cat          = {gm},
  abstract     = {Marginal independence constraints play an important
                  role in learning with graphical models. One way of
                  parameterizing a model of marginal independencies is
                  by building a latent variable model where two
                  independent observed variables have no common latent
                  source. In sparse domains, however, it might be
                  advantageous to model the marginal observed
                  distribution directly, without explicitly including
                  latent variables in the model.  There have been
                  recent advances in Gaussian and binary models of
                  marginal independence, but no models with non-linear
                  dependencies between continuous variables has been
                  proposed so far. In this paper, we describe how to
                  generalize the Gaussian model of marginal
                  independencies based on mixtures, and how to learn
                  parameters. This requires a non-standard
                  parameterization and raises difficult non-linear
                  optimization issues.}
}

@article{SilHelGhaetal10,
  author       = {R. Silva and K. A. Heller and Z. Ghahramani and
                  E. M. Airoldi},
  title        = {Ranking Relations Using Analogies in Biological and
                  Information Networks},
  journal      = {Annals of Applied Statistics},
  volume       = 4,
  number       = 2,
  pages        = {615--644},
  year         = 2010,
  url          = {.},
  cat          = {ir bioinf},
  abstract     = {Analogical reasoning depends fundamentally on the
                  ability to learn and generalize about relations
                  between objects. We develop an approach to
                  relational learning which, given a set of pairs of
                  objects S = {A(1):B(1), A(2):B(2), ..., A(N):B(N)},
                  measures how well other pairs A:B fit in with the
                  set S. Our work addresses the question: is the
                  relation between objects A and B analogous to those
                  relations found in S? Such questions are
                  particularly relevant in information retrieval,
                  where an investigator might want to search for
                  analogous pairs of objects that match the query set
                  of interest. There are many ways in which objects
                  can be related, making the task of measuring
                  analogies very challenging. Our approach combines a
                  similarity measure on function spaces with Bayesian
                  analysis to produce a ranking.  It requires data
                  containing features of the objects of interest and a
                  link matrix specifying which relationships exist; no
                  further attributes of such relationships are
                  necessary. We illustrate the potential of our method
                  on text analysis and information networks. An
                  application on discovering functional interactions
                  between pairs of proteins is discussed in detail,
                  where we show that our approach can work in practice
                  even if a small set of protein pairs is provided.}
}

@inproceedings{SinQuiBaketal04,
  author       = {Fabian Sinz and Joaquin Qui{\~n}onero-Candela and
                  G{\"o}khan H.~Bakir and Carl Edward Rasmussen and
                  Matthias O.~Franz},
  title        = {Learning Depth From Stereo},
  booktitle    = {Pattern Recognition: Proceedings of the 26th DAGM
                  Symposium},
  editor       = {C.~E.~Rasmussen and H.~H.~B{\"u}lthoff and
                  B.~Sch{\"o}lkopf and M.~A.~Giese},
  series       = lncs,
  volume       = 3175,
  pages        = {245--252},
  month        = {September},
  year         = 2004,
  publisher    = {Springer},
  address      = {Berlin, Germany},
  location     = {T{\"u}bingen, Germany},
  url          = {.},
  cat          = {gp},
  abstract     = {We compare two approaches to the problem of
                  estimating the depth of a point in space from
                  observing its image position in two different
                  cameras: 1.~The classical photogrammetric approach
                  explicitly models the two cameras and estimates
                  their intrinsic and extrinsic parameters using a
                  tedious calibration procedure; 2.~A generic machine
                  learning approach where the mapping from image to
                  spatial coordinates is directly approximated by a
                  Gaussian Process regression. Our results show that
                  the generic learning approach, in addition to
                  simplifying the procedure of calibration, can lead
                  to higher depth accuracies than classical
                  calibration although no specific domain knowledge is
                  used.}
}

@inproceedings{SneGha05,
  author       = {Edward Snelson and Zoubin Ghahramani},
  title        = {Compact Approximations to {B}ayesian Predictive
                  Distributions},
  booktitle    = icml22,
  month        = {August},
  year         = 2005,
  publisher    = {Omnipress},
  address      = {Bonn, Germany},
  url          = {http://www.gatsby.ucl.ac.uk/~snelson/Bayes_pred.pdf},
  cat          = {approx},
  abstract     = {We provide a general framework for learning precise,
                  compact, and fast representations of the Bayesian
                  predictive distribution for a model. This framework
                  is based on minimizing the KL divergence between the
                  true predictive density and a suitable compact
                  approximation. We consider various methods for doing
                  this, both sampling based approximations, and
                  deterministic approximations such as expectation
                  propagation. These methods are tested on a mixture
                  of Gaussians model for density estimation and on
                  binary linear classification, with both synthetic
                  data sets for visualization and several real data
                  sets. Our results show significant reductions in
                  prediction time and memory footprint.}
}

@incollection{SneGha06,
  author       = {Edward Snelson and Zoubin Ghahramani},
  title        = {Sparse {G}aussian Processes using Pseudo-inputs},
  booktitle    = nips18,
  editor       = {Y. Weiss and B. Sch{\"o}lkopf and J. Platt},
  pages        = {1257--1264},
  year         = 2006,
  publisher    = MIT,
  address      = {Cambridge, MA},
  url          = {http://www.gatsby.ucl.ac.uk/~snelson/SPGP_up.pdf},
  cat          = {gp},
  abstract     = {We present a new Gaussian process (GP) regression
                  model whose covariance is parameterized by the
                  locations of M pseudo-input points, which we learn
                  by a gradient based optimization. We take
                  M&lt;&lt;N, where N is the number of real data
                  points, and hence obtain a sparse regression method
                  which has O(NM<sup>2</sup>) training cost and
                  O(M<sup>2</sup>) prediction cost per test case. We
                  also find hyperparameters of the covariance function
                  in the same joint optimization. The method can be
                  viewed as a Bayesian regression model with
                  particular input dependent noise. The method turns
                  out to be closely related to several other sparse GP
                  approaches, and we discuss the relation in
                  detail. We finally demonstrate its performance on
                  some large data sets, and make a direct comparison
                  to other sparse GP methods. We show that our method
                  can match full GP performance with small M,
                  i.e. very sparse solutions, and it significantly
                  outperforms other approaches in this regime.}
}

@inproceedings{SneGha06b,
  author       = {Edward Snelson and Zoubin Ghahramani},
  title        = {Variable noise and dimensionality reduction for
                  sparse {G}aussian processes},
  booktitle    = uai22,
  editor       = {R. Dechter and T. S. Richardson},
  year         = 2006,
  publisher    = {AUAI Press},
  url          = {http://www.gatsby.ucl.ac.uk/~snelson/snelson_uai.pdf},
  cat          = {gp},
  abstract     = {The sparse pseudo-input {G}aussian process (SPGP) is
                  a new approximation method for speeding up GP
                  regression in the case of a large number of data
                  points N. The approximation is controlled by the
                  gradient optimization of a small set of M
                  pseudo-inputs, thereby reducing complexity from
                  O(N<sup>3</sup>) to O(NM<sup>2</sup>). One
                  limitation of the SPGP is that this optimization
                  space becomes impractically big for high dimensional
                  data sets. This paper addresses this limitation by
                  performing automatic dimensionality reduction. A
                  projection of the input space to a low dimensional
                  space is learned in a supervised manner, alongside
                  the pseudo-inputs, which now live in this reduced
                  space. The paper also investigates the suitability
                  of the SPGP for modeling data with input-dependent
                  noise. A further extension of the model is made to
                  make it even more powerful in this regard - we learn
                  an uncertainty parameter for each pseudo-input. The
                  combination of sparsity, reduced dimension, and
                  input-dependent noise makes it possible to apply GPs
                  to much larger and more complex data sets than was
                  previously practical. We demonstrate the benefits of
                  these methods on several synthetic and real world
                  problems.}
}

@inproceedings{SneGha07,
  author       = {Edward Snelson and Zoubin Ghahramani},
  title        = {Local and global sparse {G}aussian process
                  approximations},
  booktitle    = aistats11,
  editor       = {M. Meila and X. Shen},
  year         = 2007,
  publisher    = {Omnipress},
  url          = {http://www.gatsby.ucl.ac.uk/~snelson/localGP.pdf},
  cat          = {gp},
  abstract     = {{G}aussian process (GP) models are flexible
                  probabilistic nonparametric models for regression,
                  classification and other tasks. Unfortunately they
                  suffer from computational intractability for large
                  data sets. Over the past decade there have been many
                  different approximations developed to reduce this
                  cost. Most of these can be termed global
                  approximations, in that they try to summarize all
                  the training data via a small set of support
                  points. A different approach is that of local
                  regression, where many local experts account for
                  their own part of space. In this paper we start by
                  investigating the regimes in which these different
                  approaches work well or fail. We then proceed to
                  develop a new sparse GP approximation which is a
                  combination of both the global and local
                  approaches. Theoretically we show that it is derived
                  as a natural extension of the framework developed by
                  <a
                  href="http://mlg.eng.cam.ac.uk/pub/#QuiRas05">Qui{\~n}onero-Candela
                  and Rasmussen</a> for sparse GP approximations. We
                  demonstrate the benefits of the combined
                  approximation on some 1D examples for illustration,
                  and on some large real-world data sets.}
}

@inproceedings{SneRasGha04,
  author       = {Edward Snelson and Carl Edward Rasmussen and Zoubin
                  Ghahramani},
  title        = {Warped {G}aussian Processes},
  booktitle    = nips16,
  editor       = {S.~Thrun and L.~Saul and B.~Sch{\"o}lkopf},
  pages        = {337--344},
  month        = {December},
  year         = 2004,
  publisher    = mit,
  address      = {Cambridge, MA, USA},
  isbn         = {0-262-20152-6},
  url          = {.},
  cat          = {gp},
  abstract     = {We generalise the Gaussian process (GP) framework
                  for regression by learning a nonlinear
                  transformation of the GP outputs. This allows for
                  non-Gaussian processes and non-Gaussian noise. The
                  learning algorithm chooses a nonlinear
                  transformation such that transformed data is
                  well-modelled by a GP. This can be seen as including
                  a preprocessing transformation as an integral part
                  of the probabilistic modelling problem, rather than
                  as an ad-hoc step. We demonstrate on several real
                  regression problems that learning the transformation
                  can lead to significantly better performance than
                  using a regular GP, or a GP with a fixed
                  transformation.}
}

@inproceedings{SolMurLeietal03,
  author       = {Ercan Solak and Roderick Murray-Smith and William
                  E.~Leithead and Douglas Leith and Carl Edward
                  Rasmussen},
  title        = {Derivative observations in {G}aussian Process models
                  of dynamic systems},
  booktitle    = nips15,
  editor       = {S.~Becker and S.~Thrun and K.~Obermayer},
  pages        = {1033--1040},
  month        = {December},
  year         = 2003,
  publisher    = mit,
  address      = {Cambridge, MA, USA},
  url          = {.},
  cat          = {gp},
  abstract     = {Gaussian processes provide an approach to
                  nonparametric modelling which allows a
                  straightforward combination of function and
                  derivative observations in an empirical model. This
                  is of particular importance in identification of
                  nonlinear dynamic systems from experimental data. 1)
                  It allows us to combine derivative information, and
                  associated uncertainty with normal function
                  observations into the learning and inference
                  process.  This derivative information can be in the
                  form of priors specified by an expert or identified
                  from perturbation data close to equilibrium. 2) It
                  allows a seamless fusion of multiple local linear
                  models in a consistent manner, inferring consistent
                  models and ensuring that integrability constraints
                  are met. 3) It improves dramatically the
                  computational efficiency of Gaussian process models
                  for dynamic system identification, by summarising
                  large quantities of near-equilibrium data by a
                  handful of linearisations, reducing the training set
                  size - traditionally a problem for Gaussian process
                  models.}
}

@article{SonBraOngetal07,
  author       = {S{\"o}ren Sonnenburg and Mikio L.~Braun and Cheng
                  Soon Ong and Samy Bengio and Leon Bottou and
                  Geoffrey Holmes and Yann LeCun and Klaus-Robert
                  M{\"u}ller and Fernando Pereira and Carl Edward
                  Rasmussen and Gunnar R{\"a}tsch and Bernhard
                  Sch{\"o}lkopf and Alexander Smola and Pascal Vincent
                  and Jason Weston and Robert C.~Williamson},
  title        = {The Need for Open Source Software in Machine
                  Learning},
  journal      = jmlr,
  volume       = 8,
  pages        = {2443--2466},
  month        = {October},
  year         = 2007,
  url          = {http://www.jmlr.org/papers/volume8/sonnenburg07a/sonnenburg07a.pdf},
  cat          = {review},
  abstract     = {Open source tools have recently reached a level of
                  maturity which makes them suitable for building
                  large-scale real-world systems. At the same time,
                  the field of machine learning has developed a large
                  body of powerful learning algorithms for diverse
                  applications. However, the true potential of these
                  methods is not realized, since existing
                  implementations are not openly shared, resulting in
                  software with low usability, and weak
                  interoperability. We argue that this situation can
                  be significantly improved by increasing incentives
                  for researchers to publish their software under an
                  open source model. Additionally, we outline the
                  problems authors are faced with when trying to
                  publish algorithmic implementations of machine
                  learning methods. We believe that a resource of peer
                  reviewed software accompanied by short articles
                  would be highly valuable to both the machine
                  learning and the general scientific community.}
}

@article{SteDenCooetal10,
  author       = {O. Stegle and K. J. Denby and E. J. Cooke and
                  D. L. Wild and Z. Ghahramani and K. M. Borgwardt},
  title        = {A robust {Bayesian} two-sample test for detecting
                  intervals of differential gene expression in
                  microarray time series},
  journal      = {Journal of Computational Biology},
  volume       = 17,
  number       = 3,
  pages        = {1--13},
  year         = 2010,
  doi          = {10.1089/cmb.2009.0175},
  url          = {http://www.liebertonline.com/doi/abs/10.1089/cmb.2009.0175},
  cat          = {bioinf},
  abstract     = {Understanding the regulatory mechanisms that are
                  responsible for an organism's response to
                  environmental change is an important issue in
                  molecular biology. A first and important step
                  towards this goal is to detect genes whose
                  expression levels are affected by altered external
                  conditions. A range of methods to test for
                  differential gene expression, both in static as well
                  as in time-course experiments, have been
                  proposed. While these tests answer the question
                  \emph{whether} a gene is differentially expressed,
                  they do not explicitly address the question
                  \emph{when} a gene is differentially expressed,
                  although this information may provide insights into
                  the course and causal structure of regulatory
                  programs. In this article, we propose a two-sample
                  test for identifying intervals of differential gene
                  expression in microarray time series.  Our approach
                  is based on Gaussian process regression, can deal
                  with arbitrary numbers of replicates, and is robust
                  with respect to outliers. We apply our algorithm to
                  study the response of \emph{Arabidopsis thaliana}
                  genes to an infection by a fungal pathogen using a
                  microarray time series dataset covering 30,336 gene
                  probes at 24 observed time points. In classification
                  experiments, our test compares favorably with
                  existing methods and provides additional insights
                  into time-dependent differential expression.}
}

@inproceedings{SteDenMcHetal09,
  author       = {O. Stegle and K. Denby and S. McHattie and A. Meade
                  and D. Wild and Z. Ghahramani and K. Borgwardt},
  title        = {Discovering temporal patterns of differential gene
                  expression in microarray time series},
  booktitle    = {German Conference on Bioinformatics},
  pages        = {133--142},
  month        = {September},
  year         = 2009,
  address      = {Halle, Germany},
  url          = {.},
  cat          = {bioinf time},
  abstract     = {A wealth of time series of microarray measurements
                  have become available over recent years. Several
                  two-sample tests for detecting differential gene
                  expression in these time series have been defined,
                  but they can only answer the question \emph{whether}
                  a gene is differentially expressed across the whole
                  time series, not \emph{in which intervals} it is
                  differentially expressed. In this article, we
                  propose a Gaussian process based approach for
                  studying these dynamics of differential gene
                  expression. In experiments on \emph{Arabidopsis
                  thaliana} gene expression levels, our novel
                  technique helps us to uncover that the family of
                  WRKY transcription factors appears to be involved in
                  the early response to infection by a fungal
                  pathogen.}
}

@inproceedings{SteDenWiletal09,
  author       = {O.~Stegle and K.~Denby and David L.~Wild and Zoubin
                  Ghahramani and Karsten Borgwardt},
  title        = {A robust {Bayesian} two-sample test for detecting
                  intervals of differential gene expression in
                  microarray time series},
  booktitle    = {13th Annual International Conference on Research in
                  Computational Molecular Biology (RECOMB 2009)},
  series       = {Lecture Notes in Bioinformatics},
  volume       = 5541,
  pages        = {201--216},
  year         = 2009,
  publisher    = {Springer-Verlag},
  address      = {Tucson, AZ, USA},
  isbn         = {978-3-642-02007-0},
  doi          = {10.1007/978-3-642-02008-7_14},
  url          = {.},
  cat          = {bioinf},
  abstract     = {Understanding the regulatory mechanisms that are
                  responsible for an organism's response to
                  environmental changes is an important question in
                  molecular biology. A first and important step
                  towards this goal is to detect genes whose
                  expression levels are affected by altered external
                  conditions. A range of methods to test for
                  differential gene expression, both in static as well
                  as in time-course experiments, have been
                  proposed. While these tests answer the question
                  \emph{whether} a gene is differentially expressed,
                  they do not explicitly address the question
                  \emph{when} a gene is differentially expressed,
                  although this information may provide insights into
                  the course and causal structure of regulatory
                  programs.  In this article, we propose a two-sample
                  test for identifying \emph{intervals} of
                  differential gene expression in microarray time
                  series. Our approach is based on Gaussian process
                  regression, can deal with arbitrary numbers of
                  replicates and is robust with respect to
                  outliers. We apply our algorithm to study the
                  response of \emph{Arabidopsis thaliana} genes to an
                  infection by a fungal pathogen using a microarray
                  time series dataset covering 30,336 gene probes at
                  24 time points. In classification experiments our
                  test compares favorably with existing methods and
                  provides additional insights into time-dependent
                  differential expression.}
}

@inproceedings{SteGhaGoretal09,
  author       = {T. Stepleton and Z. Ghahramani and G.~Gordon and
                  T.-S.~Lee},
  title        = {The block diagonal infinite hidden {M}arkov model},
  booktitle    = aistats12,
  editor       = {D. van Dyk and M. Welling},
  volume       = 5,
  pages        = {552--559},
  month        = {April},
  year         = 2009,
  publisher    = {Microtome Publishing (paper) Journal of Machine
                  Learning Research},
  address      = {Clearwater Beach, FL, USA},
  note         = {ISSN 1938-7228},
  url          = {.},
  cat          = {np time},
  abstract     = {The Infinite Hidden Markov Model (IHMM) extends
                  hidden Markov models to have a countably infinite
                  number of hidden states (<a href="#BeaGhaRas02">Beal
                  et al., 2002</a>; Teh et al., 2006). We present a
                  generalization of this framework that introduces
                  nearly block-diagonal structure in the transitions
                  between the hidden states, where blocks correspond
                  to "sub-behaviors" exhibited by data sequences. In
                  identifying such structure, the model classifies, or
                  partitions, sequence data according to these
                  sub-behaviors in an unsupervised way. We present an
                  application of this model to artificial data, a
                  video gesture classification task, and a musical
                  theme labeling task, and show that components of the
                  model can also be applied to graph segmentation.}
}

@article{SunGhaBan08,
  author       = {J. M. Sung and Z. Ghahramani and S. Y. Bang},
  title        = {Latent space variational {Bayes}},
  journal      = {IEEE Transactions on Pattern Analysis and Machine
                  Intelligence},
  volume       = 30,
  number       = 12,
  pages        = {2236--2242},
  month        = {November},
  year         = 2008,
  publisher    = {IEEE},
  url          = {.},
  cat          = {approx},
  abstract     = {Variational Bayesian Expectation-Maximization
                  (VBEM), an approximate inference method for
                  probabilistic models based on factorizing over
                  latent variables and model parameters, has been a
                  standard technique for practical Bayesian
                  inference. In this paper, we introduce a more
                  general approximate inference framework for
                  conjugate-exponential family models, which we call
                  Latent-Space Variational Bayes (LSVB). In this
                  approach, we integrate out the model parameters in
                  an exact way, leaving only the latent variables. It
                  can be shown that the LSVB approach gives better
                  estimates of the model evidence as well as the
                  distribution over the latent variables than the VBEM
                  approach, but, in practice, the distribution over
                  the latent variables has to be approximated. As a
                  practical implementation, we present a First-order
                  LSVB (FoLSVB) algorithm to approximate the
                  distribution over the latent variables. From this
                  approximate distribution, one can also estimate the
                  model evidence and the posterior over the model
                  parameters. The FoLSVB algorithm is directly
                  comparable to the VBEM algorithm and has the same
                  computational complexity. We discuss how LSVB
                  generalizes the recently proposed collapsed
                  variational methods to general conjugate-exponential
                  families. Examples based on mixtures of Gaussians
                  and mixtures of Bernoullis with synthetic and
                  real-world data sets are used to illustrate some
                  advantages of our method over VBEM.}
}

@article{SunGhaBan08b,
  author       = {J. M. Sung and Z. Ghahramani and S. Y. Bang},
  title        = {Second-order latent space variational {Bayes} for
                  approximate {Bayesian} inference},
  journal      = {IEEE Signal Processing Letters},
  volume       = 15,
  pages        = {918--921},
  month        = {December},
  year         = 2008,
  publisher    = {IEEE},
  url          = {.},
  cat          = {approx},
  abstract     = {In this letter, we consider a variational
                  approximate Bayesian inference framework,
                  latent-space variational Bayes (LSVB), in the
                  general context of conjugate-exponential family
                  models with latent variables. In the LSVB approach,
                  we integrate out model parameters in an exact way
                  and then perform the variational inference over only
                  the latent variables. It can be shown that LSVB can
                  achieve better estimates of the model evidence as
                  well as the distribution over the latent variables
                  than the popular variational Bayesian
                  expectation-maximization (VBEM). However, the
                  distribution over the latent variables in LSVB has
                  to be approximated in practice. As an approximate
                  implementation of LSVB, we propose a second-order
                  LSVB (SoLSVB) method. In particular, VBEM can be
                  derived as a special case of a first-order
                  approximation in LSVB. SoLSVB can capture higher
                  order statistics neglected in VBEM and can therefore
                  achieve a better approximation. Examples of Gaussian
                  mixture models are used to illustrate the comparison
                  between our method and VBEM, demonstrating the
                  improvement.}
}

@inproceedings{TurBotGha10,
  author       = {Ryan Turner and Steven Bottone and Zoubin
                  Ghahramani},
  title        = {Fast Online Anomaly Detection Using Scan Statistics},
  booktitle    = {Machine Learning for Signal Processing (MLSP 2010)},
  editor       = {Samuel Kaski and David J. Miller and Erkki Oja and
                  Antti Honkela},
  pages        = {385--390},
  month        = {August},
  year         = 2010,
  address      = {Kittil\"{a}, Finland},
  isbn         = {978-1-4244-7876-7},
  url          = {.},
  cat          = {time},
  abstract     = {We present methods to do fast online anomaly
                  detection using scan statistics. Scan statistics
                  have long been used to detect statistically
                  significant bursts of events. We extend the scan
                  statistics framework to handle many practical issues
                  that occur in application: dealing with an unknown
                  background rate of events, allowing for slow natural
                  changes in background frequency, the inverse problem
                  of finding an unusual lack of events, and setting
                  the test parameters to maximize power. We
                  demonstrate its use on real and synthetic data sets
                  with comparison to other methods.}
}

@inproceedings{TurDeiRas09,
  author       = {Ryan Turner and Marc Peter Deisenroth and Carl
                  Edward Rasmussen},
  title        = {System Identification in {G}aussian Process
                  Dynamical Systems},
  booktitle    = {NIPS Workshop on Nonparametric Bayes},
  editor       = {Dilan G{\"o}r{\"u}r},
  month        = {December},
  year         = 2009,
  address      = {Whistler, BC, Canada},
  url          = {.},
  annote       = {<a
                  href="http://mlg.eng.cam.ac.uk/rdturner/NLDSposter.pdf">poster</a>.},
  cat          = {gp time}
}

@inproceedings{TurDeiRas10,
  author       = {Ryan Turner and Marc Peter Deisenroth and Carl
    Edward Rasmussen},
  title        = {State-Space Inference and Learning with {G}aussian
    Processes},
  booktitle    = aistats13,
  editor       = {Yee Whye Teh and Mike Titterington},
  volume       = 9,
  series       = {W\&CP},
  pages        = {868--875},
  organization = jmlr,
  address      = {Chia Laguna, Sardinia, Italy},
  month        = {May 13--15},
  year         = 2010,
  url          = {.},
  cat          = {gp time},
  abstract     = {State-space inference and learning with Gaussian
    processes (GPs) is an unsolved problem. We propose a
    new, general methodology for inference and learning
    in nonlinear state-space models that are described
    probabilistically by non-parametric GP models. We
    apply the expectation maximization algorithm to
    iterate between inference in the latent state-space
    and learning the parameters of the underlying GP
    dynamics model.},
  annote       = {<a
    href="http://mlg.eng.cam.ac.uk/rdturner/NLDSposterAISTATS.pdf">poster</a>.}
}

@inproceedings{TurRas10,
  author    = {Ryan Turner and Carl Edward Rasmussen},
  title     = {Model Based Learning of Sigma Points in Unscented
    {K}alman Filtering},
  booktitle = {Machine Learning for Signal Processing (MLSP 2010)},
  editor    = {Samuel Kaski and David J. Miller and Erkki Oja and
    Antti Honkela},
  pages     = {178--183},
  address   = {Kittil\"{a}, Finland},
  month     = {August},
  year      = 2010,
  isbn      = {978-1-4244-7876-7},
  url       = {.},
  cat       = {gp time},
  abstract  = {The unscented Kalman filter (UKF) is a widely used
    method in control and time series applications. The
    UKF suffers from arbitrary parameters necessary for
    a step known as sigma point placement, causing it to
    perform poorly in nonlinear problems. We show how to
    treat sigma point placement in a UKF as a learning
    problem in a model based view.  We demonstrate that
    learning to place the sigma points correctly from
    data can make sigma point collapse much less
    likely. Learning can result in a significant
    increase in predictive performance over default
    settings of the parameters in the UKF and other
    filters designed to avoid the problems of the UKF,
    such as the GP-ADF. At the same time, we maintain a
    lower computational complexity than the other
    methods. We call our method UKF-L.}
}

@inproceedings{TurSaaRas09,
  cat       = {time},
  author    = {Ryan Turner and Yunus Saat{\c{c}}i and Carl Edward
    Rasmussen},
  title     = {Adaptive Sequential {B}ayesian Change Point
    Detection},
  booktitle = {NIPS Workshop on Temporal Segmentation},
  year      = 2009,
  editor    = {Za{\"i}d Harchaoui},
  address   = {Whistler, BC, Canada},
  month     = {December},
  abstract  = {Real-world time series are often nonstationary with
    respect to the parameters of some underlying
    prediction model (UPM). Furthermore, it is often
    desirable to adapt the UPM to incoming regime
    changes as soon as possible, necessitating
    sequential inference about change point locations. A
    Bayesian algorithm for online change point detection
    (BOCPD) has been introduced recently by Adams and
    MacKay (2007).  In this algorithm, uncertainty about
    the last change point location is updated
    sequentially, and is integrated out to make online
    predictions robust to parameter changes. BOCPD
    requires a set of fixed hyper-parameters which allow
    the user to fully specify the hazard function for
    change points and the prior distribution over the
    parameters of the UPM.  In practice, finding the
    "right" hyper-parameters can be quite difficult. We
    therefore extend BOCPD by introducing
    hyper-parameter learning, without sacrificing the
    online nature of the algorithm.  Hyper-parameter
    learning is performed by optimizing the marginal
    likelihood of the BOCPD model, a closed-form
    quantity which can be computed sequentially. We
    illustrate performance on three real-world
    datasets.},
  url       = {.},
  annote    = {<a
    href="http://mlg.eng.cam.ac.uk/rdturner/BOCPDposter.pdf">poster</a>,
    <a
    href="http://mlg.eng.cam.ac.uk/rdturner/BOCPDtalk.pdf">slides</a>.}
}

@inproceedings{VanSaaTehGha08,
  cat       = {np mcmc time},
  author    = {Jurgen {Van Gael} and Yunus Saat{\c{c}}i and Yee Whye
    Teh and Zoubin Ghahramani},
  title     = {Beam sampling for the infinite hidden {M}arkov
    model},
  booktitle = icml25,
  volume    = 25,
  pages     = {1088--1095},
  publisher = {ACM},
  address   = {Helsinki, Finland},
  year      = 2008,
  url       = {.},
  abstract  = {The infinite hidden Markov model is a non-parametric
    extension of the widely used hidden Markov
    model. Our paper introduces a new inference
    algorithm for the infinite Hidden Markov model
    called beam sampling. Beam sampling combines slice
    sampling, which limits the number of states
    considered at each time step to a finite number,
    with dynamic programming, which samples whole state
    trajectories efficiently. Our algorithm typically
    outperforms the Gibbs sampler and is more robust. We
    present applications of iHMM inference using the
    beam sampler on changepoint detection and text
    prediction problems.}
}

@inproceedings{VanTehGha08,
  cat       = {np time},
  author    = {{Van Gael}, J. and Teh, Y. W. and Ghahramani, Z.},
  title     = {The infinite factorial hidden {M}arkov model},
  booktitle = nips21,
  editor    = {Koller, D. and Schuurmans, D. and Bottou, L. and
    Bengio, Y.},
  volume    = 21,
  pages     = {1697--1704},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  month     = {December},
  year      = 2008,
  url       = {.},
  abstract  = {The infinite factorial hidden Markov model is a
    non-parametric extension of the factorial hidden
    Markov model. Our model defines a probability
    distribution over an infinite number of independent
    binary hidden Markov chains which together produce
    an observable sequence of random variables. Central
    to our model is a new type of non-parametric prior
    distribution inspired by the Indian Buffet Process
    which we call the \emph{Indian Buffet Markov
    Process}.}
}

@inproceedings{VanVlaGha09,
  author    = {{Van Gael}, J. and Vlachos, A. and Ghahramani, Z.},
  title     = {The infinite {HMM} for unsupervised {PoS} tagging},
  booktitle = {Proceedings of the 2009 Conference on Empirical
    Methods in Natural Language Processing (EMNLP)},
  pages     = {678--687},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  month     = {August},
  year      = 2009,
  isbn      = {978-1-932432-62-6},
  url       = {.},
  cat       = {np},
  abstract  = {We extend previous work on fully unsupervised
    part-of-speech tagging. Using a non-parametric
    version of the HMM, called the infinite HMM (iHMM),
    we address the problem of choosing the number of
    hidden states in unsupervised Markov models for PoS
    tagging. We experiment with two non-parametric
    priors, the Dirichlet and Pitman-Yor processes, on
    the Wall Street Journal dataset using a parallelized
    implementation of an iHMM inference algorithm. We
    evaluate the results with a variety of clustering
    evaluation metrics and achieve equivalent or better
    performances than previously reported. Building on
    this promising result we evaluate the output of the
    unsupervised PoS tagger as a direct replacement for
    the output of a fully supervised PoS tagger for the
    task of shallow parsing and compare the two
    evaluations.}
}

@inproceedings{VanZhu07,
  author    = {{Van Gael}, J. and Zhu, X.},
  title     = {Correlation clustering for crosslingual link
    detection},
  booktitle = {International Joint Conference on Artificial
    Intelligence (IJCAI)},
  editor    = {Manuela M. Veloso},
  pages     = {1744--1749},
  address   = {Hyderabad, India},
  month     = {January},
  year      = 2007,
  url       = {.},
  abstract  = {The crosslingual link detection problem calls for
    identifying news articles in multiple languages that
    report on the same news event. This paper presents a
    novel approach based on constrained clustering.  We
    discuss a general way for constrained clustering
    using a recent, graph-based clustering framework
    called correlation clustering. We introduce a
    correlation clustering implementation that features
    linear program chunking to allow processing larger
    datasets. We show how to apply the correlation
    clustering algorithm to the crosslingual link
    detection problem and present experimental results
    that show correlation clustering improves upon the
    hierarchical clustering approaches commonly used in
    link detection, and, hierarchical clustering
    approaches that take constraints into account.}
}

@inproceedings{VlaGhaBri10,
  author    = {Andreas Vlachos and Zoubin Ghahramani and Ted
    Briscoe},
  title     = {Active Learning for Constrained {Dirichlet} Process
    Mixture Models},
  booktitle = {Proceedings of the 2010 Workshop on Geometrical
    Models of Natural Language Semantics},
  pages     = {57--61},
  address   = {Uppsala, Sweden},
  year      = 2010,
  url       = {.},
  cat       = {clust},
  abstract  = {Recent work applied Dirichlet Process Mixture Models
    to the task of verb clustering, incorporating
    supervision in the form of must-links and
    cannot-links constraints between instances. In this
    work, we introduce an active learning approach for
    constraint selection employing uncertainty-based
    sampling. We achieve substantial improvements over
    random selection on two datasets.}
}

@inproceedings{VlaGhaKor08,
  cat       = {clust},
  author    = {A. Vlachos and Z. Ghahramani and A. Korhonen},
  title     = {{D}irichlet process mixture models for verb
    clustering},
  booktitle = {ICML Workshop on Prior Knowledge for Text and
    Language Processing},
  editor    = {Guillaume Bouchard and Hal {Daum\'{e} III} and Marc
    Dymetman and Yee Whye Teh},
  pages     = {43--48},
  address   = {Helsinki, Finland},
  month     = {July},
  year      = 2008,
  url       = {.},
  abstract  = {In this work we apply Dirichlet Process Mixture
    Models to a learning task in natural language
    processing (NLP): lexical-semantic verb
    clustering. We assess the performance on a dataset
    based on Levin's (1993) verb classes using the
    recently introduced V-measure metric. In addition,
    we present a method to add human supervision to the
    model in order to influence the solution with
    respect to some prior knowledge. The quantitative
    evaluation performed highlights the benefits of the
    chosen method compared to previously used clustering
    approaches.}
}

@inproceedings{VlaKorGha09,
  cat       = {clust np},
  author    = {A. Vlachos and A. Korhonen and Z. Ghahramani},
  title     = {Unsupervised and constrained {Dirichlet} process
    mixture models for verb clustering},
  booktitle = {4th Workshop on Statistical Machine Translation,
    EACL '09},
  address   = {Athens, Greece},
  month     = {March},
  year      = 2009,
  url       = {.},
  abstract  = {In this work, we apply Dirichlet Process Mixture
    Models (DPMMs) to a learning task in natural
    language processing (NLP): lexical-semantic verb
    clustering. We thoroughly evaluate a method of
    guiding DPMMs towards a particular clustering
    solution using pairwise constraints. The
    quantitative and qualitative evaluation performed
    highlights the benefits of both standard and
    constrained DPMMs compared to previously used
    approaches. In addition, it sheds light on the use
    of evaluation measures and their practical
    application.}
}

@inproceedings{WilGha08,
  author    = {Sinead Williamson and Zoubin Ghahramani},
  title     = {Probabilistic Models for Data Combination in
    Recommender Systems},
  booktitle = {Learning from Multiple Sources Workshop, NIPS
    Conference},
  address   = {Whistler, BC, Canada},
  year      = 2008,
  url       = {.}
}

@inproceedings{WilGha10,
  author    = {Andrew Gordon Wilson and Zoubin Ghahramani},
  title     = {Copula Processes},
  booktitle = nips23,
  year      = 2010,
  note      = {Spotlight},
  url       = {http://books.nips.cc/papers/files/nips23/NIPS2010_0784.pdf},
  cat       = {np gp time},
  abstract  = {We define a copula process which describes the
    dependencies between arbitrarily many random
    variables independently of their marginal
    distributions. As an example, we develop a
    stochastic volatility model, Gaussian Copula Process
    Volatility (GCPV), to predict the latent standard
    deviations of a sequence of random variables. To
    make predictions we use Bayesian inference, with the
    Laplace approximation, and with Markov chain Monte
    Carlo as an alternative. We find our model can
    outperform GARCH on simulated and financial
    data. And unlike GARCH, GCPV can easily handle
    missing data, incorporate covariates other than
    time, and model a rich class of covariance
    structures.  },
  annote    = {<a
    href="http://books.nips.cc/papers/files/nips23/NIPS2010_0784.extra.zip">Supplementary
    Material</a>, <a
    href="http://books.nips.cc/papers/files/nips23/NIPS2010_0784_slide.pdf">slides</a>.}
}

@inproceedings{WilGha11,
  cat       = {np gp time},
  author    = {Andrew Gordon Wilson and Zoubin Ghahramani},
  title     = {Generalised {W}ishart Processes},
  booktitle = uai27,
  year      = 2011,
  abstract  = {We introduce a new stochastic process called the
    generalised Wishart process (GWP). It is a
    collection of positive semi-definite random matrices
    indexed by any arbitrary input variable. We use this
    process as a prior over dynamic (e.g. time varying)
    covariance matrices. The GWP captures a diverse
    class of covariance dynamics, naturally handles
    missing data, scales nicely with dimension, has
    easily interpretable parameters, and can use input
    variables that include covariates other than
    time. We describe how to construct the GWP,
    introduce general procedures for inference and
    prediction, and show that it outperforms its main
    competitor, multivariate GARCH, even on financial
    data that especially suits GARCH.},
  url       = {http://uai.sis.pitt.edu/papers/11/p736-wilson.pdf},
  annote    = {<a
    href="http://mlg.eng.cam.ac.uk/andrew/gwpsupp.pdf">Supplementary
    Material</a>, Best Student Paper Award}
}

@techreport{WilKnoGha11,
  cat         = {np gp time},
  author      = {Andrew Gordon Wilson and David A. Knowles and Zoubin
    Ghahramani},
  title       = {Gaussian Process Regression Networks},
  number      = {arXiv:1110.4411 [stat.ML]},
  institution = {Department of Engineering, University of Cambridge},
  address     = {Cambridge, UK},
  abstract    = {We introduce a new regression framework, Gaussian
    process regression networks (GPRN), which combines
    the structural properties of Bayesian neural
    networks with the non-parametric flexibility of
    Gaussian processes.  This model accommodates input
    dependent signal and noise correlations between
    multiple response variables, input dependent
    length-scales and amplitudes, and heavy-tailed
    predictive distributions. We derive both efficient
    Markov chain Monte Carlo and variational Bayes
    inference procedures for this model. We apply GPRN
    as a multiple output regression and multivariate
    volatility model, demonstrating substantially
    improved performance over eight popular multiple
    output (multi-task) Gaussian process models and
    three multivariate volatility models on benchmark
    datasets, including a 1000 dimensional gene
    expression dataset.},
  month       = {October 19},
  year        = 2011,
  url         = {.},
  annote      = {arXiv:<a
    href="http://arxiv.org/abs/1110.4411">1110.4411</a>}
}

@inproceedings{WilOrbGha10,
  author    = {Sinead Williamson and Peter Orbanz and Zoubin
    Ghahramani},
  title     = {Dependent {I}ndian buffet processes},
  booktitle = aistats13,
  volume    = 9,
  series    = {W\&CP},
  pages     = {924--931},
  address   = {Chia Laguna, Sardinia, Italy},
  month     = {May},
  year      = 2010,
  url       = {.},
  cat       = {np},
  abstract  = {Latent variable models represent hidden structure in
    observational data. To account for the distribution
    of the observational data changing over time, space
    or some other covariate, we need generalizations of
    latent variable models that explicitly capture this
    dependency on the covariate. A variety of such
    generalizations has been proposed for latent
    variable models based on the Dirichlet process. We
    address dependency on covariates in binary latent
    feature models, by introducing a dependent Indian
    Buffet Process. The model generates a binary random
    matrix with an unbounded number of columns for each
    value of the covariate. Evolution of the binary
    matrices over the covariate set is controlled by a
    hierarchical Gaussian process model. The choice of
    covariance functions controls the dependence
    structure and exchangeability properties of the
    model. We derive a Markov Chain Monte Carlo sampling
    algorithm for Bayesian inference, and provide
    experiments on both synthetic and real-world
    data. The experimental results show that explicit
    modeling of dependencies significantly improves
    accuracy of predictions.}
}

@inproceedings{WilRas96,
  cat       = {gp},
  author    = {Chris K.~I.~Williams and Carl Edward Rasmussen},
  title     = {Gaussian processes for regression},
  booktitle = nips8,
  editor    = {D.~S.~Touretzky and M.~C.~Mozer and M.~E.~Hasselmo},
  pages     = {514--520},
  publisher = mit,
  address   = {Cambridge, MA, USA},
  abstract  = {The Bayesian analysis of neural networks is
    difficult because a simple prior over weights
    implies a complex prior over functions. We
    investigate the use of a Gaussian process prior over
    functions, which permits the predictive Bayesian
    analysis for fixed values of hyperparameters to be
    carried out exactly using matrix operations. Two
    methods, using optimization and averaging (via
    Hybrid Monte Carlo) over hyperparameters have been
    tested on a number of challenging problems and have
    produced excellent results.},
  year      = 1996,
  url       = {.}
}

@techreport{WilRasSchTre02,
  author      = {Christopher K.~I.~Williams and Carl Edward Rasmussen
    and Anton Schwaighofer and Volker Tresp},
  title       = {Observations on the {N}ystr{\"o}m Method for
    {G}aussian Process Prediction},
  institution = {University of Edinburgh},
  year        = 2002,
  url         = {.},
  cat         = {gp},
  abstract    = {A number of methods for speeding up Gaussian Process
    (GP) prediction have been proposed, including the
    Nystr{\"o}m method of Williams and Seeger (2001). In
    this paper we focus on two issues (1) the
    relationship of the Nystr{\"o}m method to the Subset
    of Regressors method (Poggio and Girosi 1990; Luo
    and Wahba, 1997) and (2) understanding in what
    circumstances the Nystr{\"o}m approximation would be
    expected to provide a good approximation to exact GP
    regression.}
}

@inproceedings{WilWanHelBle10,
  cat       = {np},
  author    = {Sinead Williamson and C. Wang and Katherine A. Heller
    and D. M. Blei},
  title     = {The {IBP} compound {D}irichlet process and its
    application to focused topic modeling},
  booktitle = icml27,
  pages     = {1151--1158},
  year      = 2010,
  month     = {June},
  address   = {Haifa, Israel},
  url       = {.},
  abstract  = {The hierarchical Dirichlet process (HDP) is a
    Bayesian nonparametric mixed membership model ---
    each data point is modeled with a collection of
    components of different proportions. Though
    powerful, the HDP makes an assumption that the
    probability of a component being exhibited by a data
    point is positively correlated with its proportion
    within that data point. This might be an undesirable
    assumption. For example, in topic modeling, a topic
    (component) might be rare throughout the corpus but
    dominant within those documents (data points) where
    it occurs. We develop the IBP compound Dirichlet
    process (ICD), a Bayesian nonparametric prior that
    decouples across-data prevalence and within-data
    proportion in a mixed membership model. The ICD
    combines properties from the HDP and the Indian
    buffet process (IBP), a Bayesian nonparametric prior
    on binary matrices. The ICD assigns a subset of the
    shared mixture components to each data point. This
    subset, the data point's "focus", is determined
    independently from the amount that each of its
    components contribute. We develop an ICD mixture
    model for text, the focused topic model (FTM), and
    show superior performance over the HDP-based topic
    model.}
}

@inproceedings{XuHelGha09,
  author    = {Yang Xu and Katherine A. Heller and Zoubin
    Ghahramani},
  title     = {Tree-based inference for {D}irichlet process
    mixtures},
  booktitle = aistats12,
  editor    = {D.~van Dyk and M.~Welling},
  volume    = 5,
  pages     = {623--630},
  publisher = {Microtome Publishing (paper), Journal of Machine
    Learning Research (online)},
  address   = {Clearwater Beach, FL, USA},
  month     = {April},
  year      = 2009,
  note      = {ISSN 1938-7228},
  url       = {.},
  cat       = {np approx},
  abstract  = {The Dirichlet process mixture (DPM) is a widely used
    model for clustering and for general nonparametric
    Bayesian density estimation. Unfortunately, like in
    many statistical models, exact inference in a DPM is
    intractable, and approximate methods are needed to
    perform efficient inference.  While most attention
    in the literature has been placed on Markov chain
    Monte Carlo (MCMC) [1, 2, 3], variational Bayesian
    (VB) [4] and collapsed variational methods [5], [6]
    recently introduced a novel class of approximation
    for DPMs based on Bayesian hierarchical clustering
    (BHC). These tree-based combinatorial approximations
    efficiently sum over exponentially many ways of
    partitioning the data and offer a novel lower bound
    on the marginal likelihood of the DPM [6]. In this
    paper we make the following contributions: (1) We
    show empirically that the BHC lower bounds are
    substantially tighter than the bounds given by VB
    [4] and by collapsed variational methods [5] on
    synthetic and real datasets. (2) We also show that
    BHC offers a more accurate predictive performance on
    these datasets. (3) We further improve the
    tree-based lower bounds with an algorithm that
    efficiently sums contributions from alternative
    trees. (4) We present a fast approximate method for
    BHC. Our results suggest that our combinatorial
    approximate inference methods and lower bounds may
    be useful not only in DPMs but in other models as
    well.}
}

@article{YuCunSanetal09a,
  cat      = {gp},
  author   = {B. M. Yu and J. P. Cunningham and G. Santhanam and
    S. I. Ryu and K. V. Shenoy and M. Sahani},
  title    = {{G}aussian-process factor analysis for
    low-dimensional single-trial analysis of neural
    population activity},
  url      = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/YuJNP2009.pdf},
  journal  = {Journal of Neurophysiology},
  volume   = 102,
  pages    = {614--635},
  year     = 2009,
  abstract = {We consider the problem of extracting smooth,
    low-dimensional neural trajectories that summarize
    the activity recorded simultaneously from many
    neurons on individual experimental trials. Beyond
    the benefit of visualizing the high-dimensional,
    noisy spiking activity in a compact form, such
    trajectories can offer insight into the dynamics of
    the neural circuitry underlying the recorded
    activity. Current methods for extracting neural
    trajectories involve a two-stage process: the spike
    trains are first smoothed over time, then a static
    dimensionality-reduction technique is applied. We
    first describe extensions of the two-stage methods
    that allow the degree of smoothing to be chosen in a
    principled way and that account for spiking
    variability, which may vary both across neurons and
    across time. We then present a novel method for
    extracting neural trajectories -- Gaussian-process
    factor analysis (GPFA) -- which unifies the
    smoothing and dimensionality-reduction operations
    in a common probabilistic framework. We applied
    these methods to the activity of 61 neurons recorded
    simultaneously in macaque premotor and motor
    cortices during reach planning and execution. By
    adopting a goodness-of-fit metric that measures how
    well the activity of each neuron can be predicted by
    all other recorded neurons, we found that the
    proposed extensions improved the predictive ability
    of the two-stage methods. The predictive ability was
    further improved by going to GPFA. From the
    extracted trajectories, we directly observed a
    convergence in neural state during motor planning,
    an effect that was shown indirectly by previous
    studies. We then show how such methods can be a
    powerful tool for relating the spiking activity
    across a neural population to the subject's behavior
    on a single-trial basis. Finally, to assess how well
    the proposed methods characterize neural population
    activity when the underlying time course is known,
    we performed simulations that revealed that GPFA
    performed tens of percent better than the best
    two-stage method.}
}

% Yu et al. (2009): conference paper on Gaussian-process factor analysis (GPFA).
% The booktitle relies on the nips21 string macro, which must be defined
% elsewhere in this file.
@inproceedings{YuCunSanetal09b,
  author    = {B. M. Yu and J. P. Cunningham and G. Santhanam and
               S. I. Ryu and K. V. Shenoy and M. Sahani},
  title     = {{Gaussian}-process factor analysis for
               low-dimensional single-trial analysis of neural
               population activity},
  booktitle = nips21,
  year      = 2009,
  month     = dec,
  address   = {Vancouver, BC},
  pages     = {1--8},
  url       = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/YuNIPS2009.pdf},
  cat       = {gp},
  abstract  = {We consider the problem of extracting smooth,
               low-dimensional neural trajectories that summarize
               the activity recorded simultaneously from many
               neurons on individual experimental trials. Beyond
               the benefit of visualizing the high-dimensional,
               noisy spiking activity in a compact form, such
               trajectories can offer insight into the dynamics of
               the neural circuitry underlying the recorded
               activity. Current methods for extracting neural
               trajectories involve a two-stage process: the spike
               trains are first smoothed over time, then a static
               dimensionality-reduction technique is applied. We
               first describe extensions of the two-stage methods
               that allow the degree of smoothing to be chosen in a
               principled way and that account for spiking
               variability, which may vary both across neurons and
               across time. We then present a novel method for
               extracting neural trajectories -- Gaussian-process
               factor analysis (GPFA) -- which unifies the
               smoothing and dimensionality-reduction operations
               in a common probabilistic framework. We applied
               these methods to the activity of 61 neurons recorded
               simultaneously in macaque premotor and motor
               cortices during reach planning and execution. By
               adopting a goodness-of-fit metric that measures how
               well the activity of each neuron can be predicted by
               all other recorded neurons, we found that the
               proposed extensions improved the predictive ability
               of the two-stage methods. The predictive ability was
               further improved by going to GPFA. From the
               extracted trajectories, we directly observed a
               convergence in neural state during motor planning,
               an effect that was shown indirectly by previous
               studies. We then show how such methods can be a
               powerful tool for relating the spiking activity
               across a neural population to the subject's behavior
               on a single-trial basis. Finally, to assess how well
               the proposed methods characterize neural population
               activity when the underlying time course is known,
               we performed simulations that revealed that GPFA
               performed tens of percent better than the best
               two-stage method.}
}

% Zhao et al. (2011): journal article on an L1-regularized logistic model
% for detecting short-term neuronal interactions; marked in press via note.
@article{ZhaBatCunetal11,
  author   = {M. Zhao and A. P. Batista and J. P. Cunningham and
              C. A. Chestek and Z. Rivera-Alvidrez and R. Kalmar
              and S. I. Ryu and K. V. Shenoy and S. Iyengar},
  title    = {An {L1}-regularized logistic model for detecting
              short-term neuronal interactions},
  journal  = {Journal of Computational Neuroscience},
  year     = 2011,
  note     = {In Press.},
  doi      = {10.1007/s10827-011-0365-5},
  url      = {http://mlg.eng.cam.ac.uk/john/pubs/pdf/ZhaoJCNS2011.pdf},
  cat      = {time},
  abstract = {Interactions among neurons are a key component of
              neural signal processing. Rich neural data sets
              potentially containing evidence of interactions can
              now be collected readily in the laboratory, but
              existing analysis methods are often not sufficiently
              sensitive and specific to reveal these
              interactions. Generalized linear models offer a
              platform for analyzing multi-electrode recordings of
              neuronal spike train data. Here we suggest an
              L1-regularized logistic regression model (L1L
              method) to detect short-term (order of 3ms) neuronal
              interactions. We estimate the parameters in this
              model using a coordinate descent algorithm, and
              determine the optimal tuning parameter using a
              Bayesian Information Criterion. Simulation studies
              show that in general the L1L method has better
              sensitivities and specificities than those of the
              traditional shuffle-corrected cross-correlogram
              (covariogram) method. The L1L method is able to
              detect excitatory interactions with both high
              sensitivity and specificity with reasonably large
              recordings, even when the magnitude of the
              interactions is small; similar results hold for
              inhibition given sufficiently high baseline firing
              rates. Our study also suggests that the false
              positives can be further removed by thresholding,
              because their magnitudes are typically smaller than
              true interactions. Simulations also show that the
              L1L method is somewhat robust to partially observed
              networks. We apply the method to multi-electrode
              recordings collected in the monkey dorsal premotor
              cortex (PMd) while the animal prepares to make
              reaching arm movements. The results show that some
              neurons interact differently depending on task
              conditions. The stronger interactions detected with
              our L1L method were also visible using the
              covariogram method.}
}

% Zhang, Ghahramani and Yang (2008): journal article on latent variable
% models for multi-task learning. No url field: the previous value was a
% placeholder, not a resolvable link.
@article{ZhaGhaYan08,
  author    = {J. Zhang and Z. Ghahramani and Y. Yang},
  title     = {Flexible latent variable models for multi-task
               learning},
  journal   = {Machine Learning},
  volume    = 73,
  number    = 3,
  pages     = {221--242},
  year      = 2008,
  month     = dec,
  publisher = {Springer Netherlands},
  cat       = {gm},
  abstract  = {Given multiple prediction problems such as
               regression and classification, we are interested in
               a joint inference framework which can effectively
               borrow information among tasks to improve the
               prediction accuracy, especially when the number of
               training examples per problem is small. In this
               paper we propose a probabilistic framework which can
               support a set of latent variable models for
               different multi-task learning scenarios. We show
               that the framework is a generalization of standard
               learning methods for single prediction problems and
               it can effectively model the shared structure among
               different prediction tasks. Furthermore, we present
               efficient algorithms for the empirical Bayes method
               as well as point estimation. Our experiments on both
               simulated datasets and real world classification
               datasets show the effectiveness of the proposed
               models in two evaluation settings: standard
               multi-task learning setting and transfer learning
               setting.}
}

