The problem of designing cost functions to estimate a posteriori probabilities in multiclass problems is addressed in this paper. We establish necessary and sufficient conditions that these costs must satisfy in one-class one-output networks whose outputs are consistent with probability laws. We focus our attention on a particular subset of the corresponding cost functions; those which verify two usually interesting properties: symmetry and separability (well-known cost functions, such as the quadratic cost or the cross entropy are particular cases in this subset). Finally, we present a universal stochastic gradient learning rule for single-layer networks, in the sense of minimizing a general version of these cost functions for a wide family of nonlinear activation functions.

}, keywords = {Cost functions, Estimation, Functions, Learning algorithms, Multiclass problems, Neural networks, Pattern recognition, Probability, Problem solving, Random processes, Stochastic gradient learning rule}, issn = {10459227}, doi = {10.1109/72.761724}, url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-0032643080\&partnerID=40\&md5=d528195bd6ec84531e59ddd2ececcd46}, author = {Jes{\'u}s Cid-Sueiro and J I Arribas and S Urban-Munoz and A R Figueiras-Vidal} }

% Entry 412: paper in the 1999 International Joint Conference on Neural
% Networks proceedings (JNDDE with partial likelihood estimation).
% No doi field is present for this entry in the source data.
@inproceedings{412,
  title        = {Estimates of constrained multi-class a posteriori probabilities in time series problems with neural networks},
  booktitle    = {Proceedings of the International Joint Conference on Neural Networks},
  year         = {1999},
  publisher    = {IEEE, United States},
  organization = {IEEE, United States},
  address      = {Washington, DC, USA},
  abstract     = {In time series problems, where time ordering is a crucial issue, the use of Partial Likelihood Estimation (PLE) represents a specially suitable method for the estimation of parameters in the model. We propose a new general supervised neural network algorithm, Joint Network and Data Density Estimation (JNDDE), that employs PLE to approximate conditional probability density functions for multi-class classification problems. The logistic regression analysis is generalized to multiple class problems with softmax regression neural network used to model the a-posteriori probabilities such that they are approximated by the network outputs. Constraints to the network architecture, as well as to the model of data, are imposed, resulting in both a flexible network architecture and distribution modeling. We consider application of JNDDE to channel equalization and present simulation results.},
  keywords     = {Approximation theory, Computer simulation, Constraint theory, Data structures, Joint network-data density estimation (JNDDE), Mathematical models, Multi-class a posteriori probabilities, Neural networks, Partial likelihood estimation (PLE), Probability density function, Regression analysis},
  url          = {http://www.scopus.com/inward/record.url?eid=2-s2.0-0033325263\&partnerID=40\&md5=8c6134020b0b2a9c5ab05b131c070b88},
  author       = {J I Arribas and Jes{\'u}s Cid-Sueiro and T Adali and H Ni and B Wang and A R Figueiras-Vidal},
}

% Entry 411: paper in the 1999 Neural Networks for Signal Processing
% workshop proceedings. The doi field holds the bare identifier (no
% resolver prefix) so that styles can build the link themselves.
@inproceedings{411,
  title        = {Neural architectures for parametric estimation of a posteriori probabilities by constrained conditional density functions},
  booktitle    = {Neural Networks for Signal Processing - Proceedings of the IEEE Workshop},
  year         = {1999},
  publisher    = {IEEE, Piscataway, NJ, United States},
  organization = {IEEE, Piscataway, NJ, United States},
  address      = {Madison, WI, USA},
  abstract     = {A new approach to the estimation of {\textquoteleft}a posteriori{\textquoteright} class probabilities using neural networks, the Joint Network and Data Density Estimation (JNDDE), is presented in this paper. It is based on the estimation of the conditional data density functions, with some restrictions imposed by the classifier structure; the Bayes{\textquoteright} rule is used to obtain the {\textquoteleft}a posteriori{\textquoteright} probabilities from these densities. The proposed method is applied to three different network structures: the logistic perceptron (for the binary case), the softmax perceptron (for multi-class problems) and a generalized softmax perceptron (that can be used to map arbitrarily complex probability functions). Gaussian mixture models are used for the conditional densities. The method has the advantage of establishing a distinction between the network parameters and the model parameters. Complexity on any of them can be fixed as desired. Maximum Likelihood gradient-based rules for the estimation of the parameters can be obtained. It is shown that JNDDE exhibits a more robust convergence characteristics than other methods of a posteriori probability estimation, such as those based on the minimization of a Strict Sense Bayesian (SSB) cost function.},
  keywords     = {Asymptotic stability, Constraint theory, Data structures, Gaussian mixture models, Joint network and data density estimation, Mathematical models, Maximum likelihood estimation, Neural networks, Probability},
  doi          = {10.1109/NNSP.1999.788145},
  url          = {http://www.scopus.com/inward/record.url?eid=2-s2.0-0033321049\&partnerID=40\&md5=7967fa377810cc0c3e6a4d9020024b80},
  author       = {J I Arribas and Jes{\'u}s Cid-Sueiro and T Adali and A R Figueiras-Vidal},
}

% Entry 410: paper in the 1999 International Joint Conference on Neural
% Networks proceedings. The acronym in the title is brace-protected so
% that sentence-casing styles do not lowercase it; the doi field holds
% the bare identifier.
@inproceedings{410,
  title        = {Neural networks to estimate {ML} multi-class constrained conditional probability density functions},
  booktitle    = {Proceedings of the International Joint Conference on Neural Networks},
  year         = {1999},
  publisher    = {IEEE, United States},
  organization = {IEEE, United States},
  address      = {Washington, DC, USA},
  abstract     = {In this paper, a new algorithm, the Joint Network and Data Density Estimation (JNDDE), is proposed to estimate the {\textquoteleft}a posteriori{\textquoteright} probabilities of the targets with neural networks in multiple classes problems. It is based on the estimation of conditional density functions for each class with some restrictions or constraints imposed by the classifier structure and the use Bayes rule to force the a posteriori probabilities at the output of the network, known here as a implicit set. The method is applied to train perceptrons by means of Gaussian mixture inputs, as a particular example for the Generalized Softmax Perceptron (GSP) network. The method has the advantage of providing a clear distinction between the network architecture and the model of the data constraints, giving network parameters or weights on one side and data over parameters on the other. MLE stochastic gradient based rules are obtained for JNDDE. This algorithm can be applied to hybrid labeled and unlabeled learning in a natural fashion.},
  keywords     = {Generalized softmax perceptron (GSP) network, Joint network and data density estimation (JNDDE), Mathematical models, Maximum likelihood estimation, Neural networks, Probability density function, Random processes},
  doi          = {10.1109/IJCNN.1999.831174},
  url          = {http://www.scopus.com/inward/record.url?eid=2-s2.0-0033326060\&partnerID=40\&md5=bb38c144dac0872f3a467dc12170e6b6},
  author       = {J I Arribas and Jes{\'u}s Cid-Sueiro and T Adali and A R Figueiras-Vidal},
}