Mixture Density Networks: Probabilistic Regression for Uncertainty Estimation

Pingback: I am Iron Exoplanet: Meet GJ 367b, a dense sub-Earth | astrobites

Pingback: Soy un exoplaneta de hierro: Conoce a GH 367b, una sub-Tierra densa. | Astrobites en español

2022-03-13T23:10:47+05:30

Hi,
I don’t understand something in the “Avoiding Numerical Underflow” chapter. You take the negative log of p(x) as

-log p(x) = -ln(sigma) - 0.5*ln(2*pi) - 0.5*((x-mu)/sigma)^2

But shouldn’t this be “log p(x)” and not “- log p(x)” (note the minus)?

LikeLike

Reply

2022-10-07T16:56:04+05:30

Hi Manu,
first of all – great post! Thanks for your effort in writing and posting it!
I recently read some literature about MDNs, and there is one thing that I just don’t understand:
Looking at the neural network architecture in the figure labeled “Mixture Density Network: The output of a neural network parametrizes a Gaussian mixture model. Source[2]”, we see the parameters of the mixture model, i.e., the output layer’s nodes (i.e., the mixing coefficients, means and variances), as part of the neural network.
My general understanding would be that for training a neural network, the training data would in that case need to be a number of points as input (“X”, corresponding to the number of input nodes), and a number of points as output (“Y”, corresponding to the number of output nodes, i.e., number of mixtures times three [mean, variance, and mixing coefficient for each mode]). In this way, we would learn the weights to map the input to the output.
However, the training seems to be performed with single data points in the output. In other words, the “Y” of the training data does NOT consist of mean, variance, and mixing coefficient for each mode, but of a single number. Is that correct?
So how does the network learn these output parameters (mean, variance, and mixing coefficient for each mode) of the network without corresponding training data? Are they simply treated as hidden parameters to be learned just like the weights and biases?
I understand that these output parameters appear in the loss function, but I just do not see how exactly they are learned in “standard backpropagation”.
Is there a good interpretation of that?
Thank you very much!
Markus

LikeLike

Reply

2022-11-07T12:02:08+05:30

We are using the likelihood to train MDNs. Using the parameters predicted by the network, we calculate the likelihood that the “y” comes from the distribution specified by those parameters. During training we maximize this likelihood (in practice, we minimize the negative log-likelihood), and this objective drives the backprop, so no target values for the mixture parameters themselves are needed.

LikeLike

Reply

	#Sample Implementation for educational purposes
	#For full implementation check out https://github.com/manujosephv/pytorch_tabular

	# Normalisation constants of the Gaussian density, computed once at import.
	# math.tau is exactly 2 * math.pi.
	ONEOVERSQRT2PI = 1.0 / math.sqrt(math.tau)  # 1 / sqrt(2 * pi)
	LOG2PI = math.log(math.tau)  # ln(2 * pi)

	class MixtureDensityHead(nn.Module):
	    """Output head that turns backbone features into Gaussian-mixture parameters.

	    Three linear layers map `input_dim` features to `num_gaussian` values each:
	    unnormalised mixing logits (`pi`), standard deviations (`sigma`) and means
	    (`mu`). The remaining methods evaluate the mixture likelihood and draw
	    samples from it.

	    The config object must expose `input_dim`, `num_gaussian`,
	    `sigma_bias_flag`, `mu_bias_init`, `n_samples` and `central_tendency`.
	    """

	    def __init__(self, config: DictConfig, **kwargs):
	        # Initialise nn.Module before attaching any attributes to it.
	        super().__init__()
	        self.hparams = config
	        self._build_network()

	    def _build_network(self):
	        """Create the pi / sigma / mu linear layers and apply their initialisation."""
	        in_features = self.hparams.input_dim
	        n_components = self.hparams.num_gaussian

	        self.pi = nn.Linear(in_features, n_components)
	        nn.init.normal_(self.pi.weight)

	        # sigma keeps the default nn.Linear initialisation; its bias is optional.
	        self.sigma = nn.Linear(
	            in_features,
	            n_components,
	            bias=self.hparams.sigma_bias_flag,
	        )

	        self.mu = nn.Linear(in_features, n_components)
	        nn.init.normal_(self.mu.weight)
	        if self.hparams.mu_bias_init is not None:
	            # Seed each component mean with a user supplied starting value.
	            for i, bias in enumerate(self.hparams.mu_bias_init):
	                nn.init.constant_(self.mu.bias[i], bias)

	    def forward(self, x):
	        """Return `(pi, sigma, mu)` for the features `x`.

	        `pi` holds raw logits (not yet normalised); `sigma` is made strictly
	        positive; `mu` is unconstrained.
	        """
	        pi = self.pi(x)
	        # Modified ELU: ELU(x) + 1 is positive, the epsilon keeps it off zero
	        # when ELU saturates at -1.
	        sigma = nn.ELU()(self.sigma(x)) + 1 + 1e-15
	        mu = self.mu(x)
	        return pi, sigma, mu

	    def gaussian_probability(self, sigma, mu, target, log=False):
	        """Evaluate every Gaussian component's density at `target`.

	        Arguments:
	            sigma: Standard deviations of the components, as returned by
	                `forward` (last dimension is the number of Gaussians).
	            mu: Means of the components, same shape as `sigma`.
	            target: Observed values. It is expanded to the shape of `sigma`
	                with `expand_as`, so it must be broadcastable to it
	                (presumably (B, 1) for a scalar target - TODO confirm).
	            log: When True return the log-density, otherwise the density.

	        Returns:
	            A tensor shaped like `sigma` with one (log-)density per component.
	        """
	        target = target.expand_as(sigma)
	        z = (target - mu) / sigma
	        if log:
	            # log N(t | mu, sigma) = -log(sigma) - 0.5*log(2*pi) - 0.5*z^2
	            return -torch.log(sigma) - 0.5 * LOG2PI - 0.5 * torch.pow(z, 2)
	        return (ONEOVERSQRT2PI / sigma) * torch.exp(-0.5 * z**2)

	    def log_prob(self, pi, sigma, mu, y):
	        """Return the log-likelihood of `y` under the mixture, per row.

	        Computes log(sum_k w_k * N(y | mu_k, sigma_k)) with `logsumexp` over the
	        last dimension so the component densities never leave log space.
	        """
	        log_component_prob = self.gaussian_probability(sigma, mu, y, log=True)
	        # NOTE(review): gumbel_softmax adds random Gumbel noise, so the mixing
	        # weights (and hence this value) differ between calls - confirm intended.
	        # The epsilon guards against log(0).
	        mixing_weights = nn.functional.gumbel_softmax(pi, tau=1, dim=-1)
	        log_mix_prob = torch.log(mixing_weights + 1e-15)
	        return torch.logsumexp(log_component_prob + log_mix_prob, dim=-1)

	    def sample(self, pi, sigma, mu):
	        """Draw one sample per row from the mixture.

	        `pi` must already be valid (non-negative) mixing weights because it is
	        passed straight to `Categorical`. Returns a tensor of shape (B, 1).
	        """
	        # Pick one component index per row.
	        component = Categorical(pi).sample().unsqueeze(1)
	        # Standard normal noise on the same device / dtype as sigma.
	        noise = torch.randn(
	            sigma.size(0), 1, dtype=sigma.dtype, device=sigma.device
	        )
	        # Scale and shift the noise with the chosen component's parameters.
	        return noise * sigma.gather(1, component) + mu.gather(1, component)

	    def generate_samples(self, pi, sigma, mu, n_samples=None):
	        """Draw `n_samples` samples per row and return them as (B, n_samples).

	        `pi` are raw logits; they are normalised here. When `n_samples` is
	        None the value from the config (`hparams.n_samples`) is used.
	        """
	        if n_samples is None:
	            n_samples = self.hparams.n_samples
	        softmax_pi = nn.functional.gumbel_softmax(pi, tau=1, dim=-1)
	        assert not (softmax_pi < 0).any(), "pi parameter should not have negative"
	        draws = [self.sample(softmax_pi, sigma, mu) for _ in range(n_samples)]
	        return torch.cat(draws, dim=1)

	    def generate_point_predictions(self, pi, sigma, mu, n_samples=None):
	        """Collapse sampled values into a single prediction per row.

	        Samples `n_samples` values per row and reduces them with the mean or
	        the median, depending on `hparams.central_tendency`.

	        Raises:
	            ValueError: if `central_tendency` is neither "mean" nor "median".
	        """
	        samples = self.generate_samples(pi, sigma, mu, n_samples)
	        central_tendency = self.hparams.central_tendency
	        if central_tendency == "mean":
	            return torch.mean(samples, dim=-1)
	        if central_tendency == "median":
	            return torch.median(samples, dim=-1).values
	        # Previously fell through and failed on an unbound local variable.
	        raise ValueError(
	            "central_tendency must be 'mean' or 'median', "
	            f"got {central_tendency!r}"
	        )

	#Sample Implementation for educational purposes
	#For full implementation check out https://github.com/manujosephv/pytorch_tabular

	class BaseMDN(BaseModel):
	    """Base model that feeds backbone features into a mixture density head.

	    Subclasses implement `unpack_input`. `self.backbone`, `self.mdn`,
	    `self.hparams` and `self.log` are used here but not defined in this class
	    (presumably provided by `BaseModel` or the subclass - verify).
	    """

	    def __init__(self, config: DictConfig, **kwargs):
	        super().__init__(config, **kwargs)

	    @abstractmethod
	    def unpack_input(self, x: Dict):
	        """Turn the input dict into whatever `self.backbone` accepts."""
	        pass

	    def forward(self, x: Dict):
	        """Run backbone and MDN head.

	        Returns a dict with the mixture parameters (`pi`, `sigma`, `mu`) and
	        the backbone output (`backbone_features`).
	        """
	        features = self.backbone(self.unpack_input(x))
	        pi, sigma, mu = self.mdn(features)
	        return {"pi": pi, "sigma": sigma, "mu": mu, "backbone_features": features}

	    def sample(self, x: Dict, n_samples: Optional[int] = None, ret_model_output=False):
	        """Draw samples from the predicted mixture for the inputs `x`.

	        Returns the samples, or `(samples, forward_output)` when
	        `ret_model_output` is true.
	        """
	        model_output = self.forward(x)
	        samples = self.mdn.generate_samples(
	            model_output["pi"],
	            model_output["sigma"],
	            model_output["mu"],
	            n_samples,
	        )
	        if ret_model_output:
	            return samples, model_output
	        return samples

	    def _weight_penalty(self, layer, coefficient):
	        """Return `coefficient` times the norm of `layer`'s parameters, or 0 when disabled."""
	        if coefficient > 0:
	            flat_params = torch.cat([p.view(-1) for p in layer.parameters()])
	            return coefficient * torch.norm(
	                flat_params, self.hparams.mdn_config.weight_regularization
	            )
	        return 0

	    def calculate_loss(self, y, pi, sigma, mu, tag="train"):
	        """Compute, log and return the training objective.

	        The objective is the mean negative log-likelihood of `y` under the
	        predicted mixture. When `mdn_config.weight_regularization` is set, a
	        norm penalty of that order is added for the sigma, pi and mu layers,
	        each scaled by its own coefficient (`lambda_sigma`, `lambda_pi`,
	        `lambda_mu`); a coefficient <= 0 disables that term.

	        The value is logged as "<tag>_loss": per step for tag "train", per
	        epoch for tag "valid".
	        """
	        # Negative log-likelihood.
	        loss = torch.mean(-self.mdn.log_prob(pi, sigma, mu, y))

	        mdn_config = self.hparams.mdn_config
	        if mdn_config.weight_regularization is not None:
	            sigma_reg = self._weight_penalty(self.mdn.sigma, mdn_config.lambda_sigma)
	            # The pi term previously used lambda_sigma as its coefficient.
	            pi_reg = self._weight_penalty(self.mdn.pi, mdn_config.lambda_pi)
	            mu_reg = self._weight_penalty(self.mdn.mu, mdn_config.lambda_mu)
	            loss = loss + sigma_reg + pi_reg + mu_reg

	        self.log(
	            f"{tag}_loss",
	            loss,
	            on_epoch=(tag == "valid"),
	            on_step=(tag == "train"),
	            logger=True,
	            prog_bar=True,
	        )
	        return loss

Mixture Density Networks: Probabilistic Regression for Uncertainty Estimation

Types of Uncertainty

The Core Idea