-- display_recpool_net.lua (rolfe/recpool on GitHub, master branch)
require 'image'
require 'gnuplot'

-- Thresholds on the encoder/decoder angle (the result of acos, so in radians) used by
-- invariance_builder:set_encoder_and_decoder to split the hidden units into two groups:
--   angle <= part_thresh  -> unit is included in part_filter
--   angle >= cat_thresh   -> unit is included in cat_filter
-- Units whose angle falls strictly between the two thresholds belong to neither group.
-- Exactly one of the assignments below should be active; the others record the values used
-- for earlier experiments.
--local part_thresh, cat_thresh = 0.5, 0.7 -- FOR PAPER (MNIST)
--local part_thresh, cat_thresh = 0.7, 1.0 -- Unsupervised MNIST with entropy loss
--local part_thresh, cat_thresh = 0.45, 0.5 -- ENTROPY EXPERIMENTS
--local part_thresh, cat_thresh = 0.25, 0.3 -- CIFAR ENTROPY EXPERIMENTS
--local part_thresh, cat_thresh = 0.2, 0.3 --0.275 -- CIFAR ENTROPY EXPERIMENTS 8x8
--local part_thresh, cat_thresh = 0.39, 0.4 -- CIFAR ENTROPY EXPERIMENTS 12x12 with increased softmax scaling
--local part_thresh, cat_thresh = 0.35, 0.42 -- CIFAR ENTROPY EXPERIMENTS, after pretraining when a continuum exists, conservative definition of categorical-units
--local part_thresh, cat_thresh = 0.35, 0.351 -- CIFAR ENTROPY EXPERIMENTS, after pretraining when a continuum exists, liberal definition of categorical-units
--local part_thresh, cat_thresh = 0.3, 0.6 -- CIFAR ENTROPY EXPERIMENTS
local part_thresh, cat_thresh = 0.1, 0.12 -- CIFAR ENTROPY EXPERIMENTS, sparse coding pretraining only (currently active)
--local part_thresh, cat_thresh = 0.2, 0.21 -- CIFAR ENTROPY EXPERIMENTS, sparse coding pretraining only
--local part_thresh, cat_thresh = 0.49, 0.5 -- MNIST, sparse coding pretraining only

-- Write a plot of the recorded training error to <params.rundir>/error.png.
--   t : iteration counter; the first max(t / params.textstatinterval, 2) entries of the
--       global avTrainingError tensor are plotted.
-- Relies on the globals `params` and `avTrainingError`, which are defined outside this function.
local function plot_training_error(t)
   local output_path = params.rundir .. '/error.png'
   gnuplot.pngfigure(output_path)

   -- always plot at least two points, so that a line can be drawn
   local stat_interval = params.textstatinterval
   local num_points = math.max(t/stat_interval, 2)
   local recorded_error = avTrainingError:narrow(1, 1, num_points)
   gnuplot.plot(recorded_error)

   gnuplot.title('Training Error')
   gnuplot.xlabel('# iterations / ' .. stat_interval)
   gnuplot.ylabel('Cost')

   -- flush the figure to disk and release the gnuplot windows
   gnuplot.plotflush()
   gnuplot.closeall()
end

-- Tile a bank of filters into a single image and save it as <log_directory>/<filter_name>.png.
--   current_filter      : 2-D tensor with one filter per column; each column is cut along
--                         dimension 1 into rows of length sqrt(current_filter:size(1)) to form a tile
--   filter_name         : name of the output file, without the '.png' extension
--   log_directory       : directory in which the image is written
--   num_display_columns : number of tiles per row of the montage (default 10)
--   symmetric           : passed through to image.toDisplayTensor; nil is treated as true
--       symmetric == true or nil -> scale colormap between -max and max, with gray = 0; the
--                                   filter is negated (on a copy) so it is flipped from white to black
--       symmetric == false       -> the filters have already been scaled, and should be plotted
--                                   as-is (or something like that; check!)
function save_filter(current_filter, filter_name, log_directory, num_display_columns, symmetric)
   if symmetric == nil then
      symmetric = true
   end
   if not num_display_columns then
      num_display_columns = 10
   end

   -- Switch for the experimental layout that keeps the R, G, and B channels of CIFAR input
   -- filters aligned.  It is disabled, so the single-channel layout below is always used.
   local align_rgb_channels = false

   local num_rows = current_filter:size(1)
   local side_length, tiles
   if align_rgb_channels and (num_rows % 3 == 0) then
      side_length = math.sqrt(num_rows/3)
      -- after unfolding, the original dimension iterates across groups; the last dimension iterates within groups
      -- (unfold is used rather than reshape, since reshape would copy the entire filter)
      tiles = current_filter:unfold(1, side_length, side_length)
      tiles = tiles:unfold(1, side_length, side_length)
      tiles = tiles:transpose(1,2) -- may still need to transpose the last two dimensions!!!
   else
      side_length = math.sqrt(num_rows)
      tiles = current_filter:unfold(1, side_length, side_length)
      tiles = tiles:transpose(1,2)
   end

   if symmetric then
      -- work on a copy so we don't risk corrupting the original data
      tiles = tiles:clone():mul(-1)
   end

   local montage = image.toDisplayTensor{input=tiles, padding=1, nrow=num_display_columns, symmetric=symmetric}
   local output_path = paths.concat(log_directory, filter_name .. '.png')
   image.savePNG(output_path, montage)
end

-- Save montages of 60 encoding and 60 decoding filters selected by their rank in sorted_indices:
-- the first 20 ranks, 20 ranks spread through the middle, and the last 20 ranks.
--   encoding_filter : tensor whose rows are indexed by hidden unit
--   decoding_filter : tensor whose columns are indexed by hidden unit
--   sorted_indices  : hidden-unit indices ordered by the selected measure
--   opt             : table; opt.log_directory is where the images are written
-- Output files are 'sorted_decoding_fe_dict.png' and 'sorted_encoding_fe_dict.png'.
function plot_sorted_filters(encoding_filter, decoding_filter, sorted_indices, opt)
   local num_plain_sorted_filters = 60 -- three groups of 20
   local group_size = num_plain_sorted_filters/3
   local filter_length, num_units = decoding_filter:size(1), decoding_filter:size(2)

   local selected_filters_decoding = torch.Tensor(num_plain_sorted_filters, filter_length)
   local selected_filters_encoding = torch.Tensor(num_plain_sorted_filters, filter_length)

   -- copy the filters of the unit with the given rank into the given row of both figures
   local function place(figure_row, rank)
      selected_filters_decoding:select(1, figure_row):copy(decoding_filter:select(2, sorted_indices[rank]))
      selected_filters_encoding:select(1, figure_row):copy(encoding_filter:select(1, sorted_indices[rank]))
   end

   for i = 1,group_size do
      -- head of the ordering fills the first group of rows
      place(i, i)
      -- tail of the ordering fills the last group of rows, working backwards from the end
      place(num_plain_sorted_filters - i + 1, num_units - i + 1)
      -- evenly spaced ranks between the head and tail fill the middle group of rows
      local middle_rank = math.ceil(group_size + (i/(1 + group_size)) * (num_units - 2*group_size))
      place(group_size + i, middle_rank)
   end

   -- save_filter expects one filter per column, so transpose before saving
   save_filter(selected_filters_decoding:t(), 'sorted_decoding_fe_dict', opt.log_directory, 20)
   save_filter(selected_filters_encoding:t(), 'sorted_encoding_fe_dict', opt.log_directory, 20)
end


-- Construct the dictionary matrix that optimally (in the least-squares sense) reconstructs
-- data_set from hidden_activation, and write its normalized rows into output_matrix.
-- (odm stands for optimal dictionary matrix)
--   data_set          : nExamples x input_dim
--   hidden_activation : nExamples x hidden_dim
--   output_matrix     : hidden_dim x input_dim; it is already restricted to the correct hidden
--                       unit.  Row i is overwritten with the unit-norm optimal filter of hidden
--                       unit i when that unit is active; rows of inactive units are left untouched.
-- Returns nothing.  Prints the number of active units, the actual reconstruction error, and the
-- error predicted from the residual rows returned by torch.gels.
function construct_optimal_dictionary(data_set, hidden_activation, output_matrix)
   -- gels only works properly if hidden_activation is full rank, and is unstable if
   -- hidden_activation is ill-conditioned.  Remove any hidden units that do not have sufficient
   -- activation.  The norm of each column is computed exactly once, here.
   local activity_threshold = 0.05
   local num_examples, num_hidden_units = hidden_activation:size(1), hidden_activation:size(2)

   local active_units = {} -- indices of the active hidden units, in increasing order
   for i = 1,num_hidden_units do
      if hidden_activation:select(2,i):norm() > activity_threshold then
         active_units[#active_units + 1] = i
      end
   end
   local num_active_units = #active_units
   print('found ' .. num_active_units .. ' active units')

   -- construct a reduced version of hidden activation, which only contains the active hidden
   -- units.  Use this to reconstruct the optimal dictionary
   local conservative_hidden_activation = torch.Tensor(num_examples, num_active_units)
   for column, unit in ipairs(active_units) do
      conservative_hidden_activation:select(2,column):copy(hidden_activation:select(2,unit))
   end

   -- the first num_active_units rows hold the solution; the remaining rows hold residual information
   local conservative_optimal_dictionary_matrix = torch.gels(data_set, conservative_hidden_activation)

   -- un-normalized dictionary with zero rows for the inactive units - for debug only, at this point
   local optimal_dictionary_matrix_slice = torch.Tensor(num_hidden_units, data_set:size(2)):zero()
   for row, unit in ipairs(active_units) do
      local selected_filter = conservative_optimal_dictionary_matrix:select(1,row)
      -- select the desired row of the output matrix; this ignores the extra (residual) rows
      output_matrix:select(1,unit):copy(selected_filter):div(selected_filter:norm())
      optimal_dictionary_matrix_slice:select(1,unit):copy(selected_filter)
   end

   print('actual error is ' .. data_set:dist(hidden_activation*optimal_dictionary_matrix_slice))

   -- The residual rows only exist when there are more examples than active units; narrowing to a
   -- non-positive number of rows would raise an error after all of the real work has been done.
   local num_residual_rows = num_examples - num_active_units
   local predicted_error = 0
   if num_residual_rows > 0 then
      local residual_rows = conservative_optimal_dictionary_matrix:narrow(1, num_active_units + 1, num_residual_rows)
      predicted_error = math.sqrt(residual_rows:pow(2):sum())
   end
   print('predicted error is ' .. predicted_error)
end

-- Build an accumulator of invariance statistics over trajectories of shifted inputs.
-- Use in place of receptive_field_builder_factory to accumulate invariance information across epochs.
-- For invariance statistics, use something like prob(h(x+1) > 0 | h(x) > 0) / (prob(h(x+1) > 0)) ; this is what Goodfellow et al. 2009 use, but with a threshold (non-zero) tuned so that each hidden unit is "active" a fixed percentage of the time.  We separately accumulate the conditional and marginal counts of h(x+1) > 0, and find the ratio when plotting.  This is plotted relative to the categoricalness of the unit.
--
--   hidden_layer_size : number of hidden units; sizes every per-unit counter
--   max_num_shifts    : number of rows required in every batch passed to accumulate_shrink_weighted_inputs
--   returns           : a table with the methods set_encoder_and_decoder, reset_filters_based_on_invariance,
--                       accumulate_shrink_weighted_inputs, and plot_invariance_scatterplot
--
-- NOTE(review): part_filter and cat_filter are nil until set_encoder_and_decoder or
-- reset_filters_based_on_invariance has been called; accumulate_shrink_weighted_inputs and
-- plot_invariance_scatterplot both use them, so one of those must presumably be called first.
function invariance_builder_factory(hidden_layer_size, max_num_shifts)
   local num_conditional_activation_invariance_shifts = 40 -- number of shift intervals tracked by the *_all counters

   local invariance_builder = {}
   -- p(h(x+1) > 0) = num_iters_activated / num_iters_processed
   -- p(h(x+1) > 0 | h(x) > 0) = num_iters_conditionally_activated / num_iters_condition_satisfied
   -- NOTE(review): all counts are sums of sign(h), so they only equal "number of iters with h > 0" if the shrink outputs are never negative - confirm
   local num_iters_processed = 0
   local num_iters_activated = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x) > 0
   local num_iters_condition_satisfied = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x) > 0, not counting the last iter of a trajectory, since conditional activation is not possible
   local num_iters_conditionally_activated = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x+1) > 0 and (given that) h(x) > 0
   local num_iters_condition_satisfied_2 = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x) > 0, not counting the last two iters of a trajectory
   local num_iters_conditionally_activated_2 = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x+2) > 0 and (given that) h(x) > 0
   local num_iters_condition_satisfied_3 = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x) > 0, not counting the last three iters of a trajectory
   local num_iters_conditionally_activated_3 = torch.Tensor(hidden_layer_size):zero() -- num iters that h(x+3) > 0 and (given that) h(x) > 0

   -- the same two counters for every shift interval from 1 to num_conditional_activation_invariance_shifts; row i holds shift interval i
   local num_iters_condition_satisfied_all = torch.Tensor(num_conditional_activation_invariance_shifts, hidden_layer_size):zero()
   local num_iters_conditionally_activated_all = torch.Tensor(num_conditional_activation_invariance_shifts, hidden_layer_size):zero()
   local conditionally_activated_all = torch.Tensor() -- scratch buffer, resized on every batch

   -- scratch buffers, reused across batches to avoid reallocation
   local shrink_sign = torch.Tensor()
   local shrink_sign_sum = torch.Tensor(1,hidden_layer_size)
   local conditionally_activated = torch.Tensor()
   local conditionally_activated_2 = torch.Tensor()
   local conditionally_activated_3 = torch.Tensor()

   -- element i+1 of the per-shift accumulators corresponds to a shift of i (shifts run from 0 to max_num_shifts - 1)
   local accumulated_distance_per_shift = torch.Tensor(max_num_shifts):zero()
   local accumulated_distance_per_shift_part = torch.Tensor(max_num_shifts):zero()
   local accumulated_distance_per_shift_cat = torch.Tensor(max_num_shifts):zero()
   local num_samples_per_shift = torch.Tensor(max_num_shifts):zero()
   local accumulated_magnitude, accumulated_magnitude_part, accumulated_magnitude_cat, num_samples_accumulated_magnitude = 0,0,0,0


   -- part_filter and cat_filter are 1 x hidden_layer_size masks (1 = unit selected, 0 = not selected)
   local angle_between_encoder_and_decoder, part_filter, cat_filter

   -- Set part_filter and cat_filter from the angle between each unit's encoder row and decoder column.
   --   encoding_filter : rows indexed by hidden unit (its transpose is multiplied elementwise with decoding_filter)
   --   decoding_filter : columns indexed by hidden unit
   -- Units with angle <= part_thresh enter part_filter; units with angle >= cat_thresh enter cat_filter.
   function invariance_builder:set_encoder_and_decoder(encoding_filter, decoding_filter)
      -- acos( <encoder_i, decoder_i> / (|encoder_i| * |decoder_i|) ) for every hidden unit i
      angle_between_encoder_and_decoder = torch.cdiv(torch.cmul(encoding_filter:t(), decoding_filter):sum(1):select(1,1),
						     torch.cmul(torch.pow(encoding_filter, 2):sum(2):select(2,1):sqrt(),
								torch.pow(decoding_filter, 2):sum(1):select(1,1):sqrt())):acos()
      part_filter = torch.Tensor(1,hidden_layer_size):copy(angle_between_encoder_and_decoder:le(part_thresh))
      cat_filter = torch.Tensor(1,hidden_layer_size):copy(angle_between_encoder_and_decoder:ge(cat_thresh))
      print('part_filter', part_filter)
      print('cat_filter', cat_filter)
   end

   -- Set part_filter and cat_filter based upon the shift-1 invariance of the units accumulated so far,
   -- then clear the activation counters for shifts 1 to 3.
   --   part_upper_bound : units with invariance <= this value enter part_filter
   --   cat_lower_bound  : units with invariance >= this value enter cat_filter
   -- NOTE(review): the *_all counters, the per-shift distance accumulators, and the magnitude accumulators are not reset here - confirm this is intended
   function invariance_builder:reset_filters_based_on_invariance(part_upper_bound, cat_lower_bound) -- set part_filter and cat_filter based upon the invariance of the units
      local invariance = torch.cdiv(torch.cdiv(num_iters_conditionally_activated, num_iters_condition_satisfied),
				    torch.div(num_iters_activated, num_iters_processed))
      part_filter = torch.Tensor(1,hidden_layer_size):copy(invariance:le(part_upper_bound))
      cat_filter = torch.Tensor(1,hidden_layer_size):copy(invariance:ge(cat_lower_bound))
      print('part_filter', part_filter)
      print('cat_filter', cat_filter)

      num_iters_processed = 0
      num_iters_activated:zero()
      num_iters_condition_satisfied:zero()
      num_iters_conditionally_activated:zero()
      num_iters_condition_satisfied_2:zero()
      num_iters_conditionally_activated_2:zero()
      num_iters_condition_satisfied_3:zero()
      num_iters_conditionally_activated_3:zero()
   end

   -- Accumulate the statistics of one batch, whose rows are treated as consecutive positions along a trajectory.
   --   shrink_copies : array of modules; only the .output of the last one is used for the statistics
   --   new_input, base_shrink, new_target : unused here; present only for interface compatibility
   -- Raises an error if the batch does not contain exactly max_num_shifts rows.
   function invariance_builder:accumulate_shrink_weighted_inputs(new_input, base_shrink, shrink_copies, new_target) -- name and arguments are fixed by receptive_field_builder for compatibility
      -- we can assume that the input is in minibatches, since this is the format in which the invariant trajectories are presented
      local final_shrink_output = shrink_copies[#shrink_copies].output
      local intermediate_shrink_output = shrink_copies[math.min(2, #shrink_copies)].output -- only referenced by the commented-out alternative below
      local selected_shrink_output = final_shrink_output --intermediate_shrink_output -- determines whether configuration invariance is calculated based upon the intermediate or final state
      --local selected_shrink_output = intermediate_shrink_output -- determines whether configuration invariance is calculated based upon the intermediate or final state

      local num_iters_batch = selected_shrink_output:size(1)
      num_iters_processed = num_iters_processed + num_iters_batch

      shrink_sign:resizeAs(selected_shrink_output):copy(selected_shrink_output):sign()
      -- looking at the invariance after requiring that the unit activities exceed some threshold greater than 0 doesn't seem to improve the correlation between categoricalness and invariance
      --shrink_sign:resizeAs(final_shrink_output):copy(final_shrink_output):zeroLtN(0.001)
      --shrink_sign:sign()
      -- conditionally_activated_k starts as the activation sign k rows later, i.e. sign(h(x+k)), aligned with row x
      conditionally_activated:resize(shrink_sign:size(1) - 1, shrink_sign:size(2)):copy(shrink_sign:narrow(1,2,num_iters_batch-1))
      conditionally_activated_2:resize(shrink_sign:size(1) - 2, shrink_sign:size(2)):copy(shrink_sign:narrow(1,3,num_iters_batch-2))
      conditionally_activated_3:resize(shrink_sign:size(1) - 3, shrink_sign:size(2)):copy(shrink_sign:narrow(1,4,num_iters_batch-3))
      conditionally_activated_all:resize(shrink_sign:size(1), shrink_sign:size(2)):zero()

      shrink_sign_sum:sum(shrink_sign, 1) -- calculate num iters that h(x) > 0
      num_iters_activated:add(shrink_sign_sum:select(1,1))

      -- shift-1 invariance
      local shrink_sign_restricted = shrink_sign:narrow(1,1,num_iters_batch - 1) -- calculate num iters that h(x) > 0, not counting the last iter
      shrink_sign_sum:sum(shrink_sign_restricted, 1)
      num_iters_condition_satisfied:add(shrink_sign_sum:select(1,1))

      -- zeroLtN2 is not defined in this file; per the original author's note it sets h(x+1) = 0 if h(x) <= 0
      conditionally_activated:zeroLtN2(shrink_sign_restricted, 0) -- sets h(x+1) = 0 if h(x) <= 0
      shrink_sign_sum:sum(conditionally_activated, 1)
      num_iters_conditionally_activated:add(shrink_sign_sum:select(1,1)) -- num iters that h(x+1) > 0 and h(x) > 0

      -- shift-2 invariance
      shrink_sign_restricted = shrink_sign:narrow(1,1,num_iters_batch - 2) -- calculate num iters that h(x) > 0, not counting the last two iters
      shrink_sign_sum:sum(shrink_sign_restricted, 1)
      num_iters_condition_satisfied_2:add(shrink_sign_sum:select(1,1))

      conditionally_activated_2:zeroLtN2(shrink_sign_restricted, 0) -- sets h(x+2) = 0 if h(x) <= 0
      shrink_sign_sum:sum(conditionally_activated_2, 1)
      num_iters_conditionally_activated_2:add(shrink_sign_sum:select(1,1)) -- num iters that h(x+2) > 0 and h(x) > 0

      -- shift-3 invariance
      shrink_sign_restricted = shrink_sign:narrow(1,1,num_iters_batch - 3) -- calculate num iters that h(x) > 0, not counting the last three iters
      shrink_sign_sum:sum(shrink_sign_restricted, 1)
      num_iters_condition_satisfied_3:add(shrink_sign_sum:select(1,1))

      conditionally_activated_3:zeroLtN2(shrink_sign_restricted, 0) -- sets h(x+3) = 0 if h(x) <= 0
      shrink_sign_sum:sum(conditionally_activated_3, 1)
      num_iters_conditionally_activated_3:add(shrink_sign_sum:select(1,1)) -- num iters that h(x+3) > 0 and h(x) > 0

      -- the same computation for every shift interval i, using the leading rows of conditionally_activated_all as scratch space
      -- NOTE(review): the narrow calls presumably require num_iters_batch > num_conditional_activation_invariance_shifts - confirm
      local conditionally_activated_i
      for i = 1,num_conditional_activation_invariance_shifts do
	 conditionally_activated_i = conditionally_activated_all:narrow(1, 1, shrink_sign:size(1) - i):copy(shrink_sign:narrow(1,i+1,num_iters_batch-i))

	 shrink_sign_restricted = shrink_sign:narrow(1,1,num_iters_batch - i) -- calculate num iters that h(x) > 0, not counting the last i iters
	 shrink_sign_sum:sum(shrink_sign_restricted, 1)
	 num_iters_condition_satisfied_all:select(1,i):add(shrink_sign_sum:select(1,1))

	 conditionally_activated_i:zeroLtN2(shrink_sign_restricted, 0) -- sets h(x+i) = 0 if h(x) <= 0
	 shrink_sign_sum:sum(conditionally_activated_i, 1)
	 num_iters_conditionally_activated_all:select(1,i):add(shrink_sign_sum:select(1,1)) -- num iters that h(x+i) > 0 and h(x) > 0
      end


      -- NOTE(review): this check runs after the activation counters above have already been updated for this batch
      if num_iters_batch ~= max_num_shifts then
	 error('number of shifts in this batch ' .. num_iters_batch .. ' does not match the expected number ' .. max_num_shifts)
      end
      -- accumulate the euclidean distance between the codes of all pairs of rows separated by a shift of i,
      -- over all units (full), and restricted to the units selected by part_filter and by cat_filter
      for i = 0,max_num_shifts - 1 do
	 local narrowed_left = selected_shrink_output:narrow(1,1,max_num_shifts - i)
	 local narrowed_right = selected_shrink_output:narrow(1,1+i, max_num_shifts - i)
	 local diff_tensor = torch.add(narrowed_left, -1, narrowed_right):pow(2):sum(2):select(2,1):sqrt()
	 --local diff_tensor = torch.add(narrowed_left:clone():sign(), -1, narrowed_right:clone():sign()):abs():sign():sum(2):select(2,1)
	 accumulated_distance_per_shift[i+1] = accumulated_distance_per_shift[i+1] + diff_tensor:sum()
	 num_samples_per_shift[i+1] = num_samples_per_shift[i+1] + diff_tensor:size(1)

	 local diff_tensor_part = torch.add(narrowed_left, -1, narrowed_right):cmul(torch.expandAs(part_filter, narrowed_left)):pow(2):sum(2):select(2,1):sqrt()
	 --local diff_tensor_part = torch.add(narrowed_left:clone():sign(), -1, narrowed_right:clone():sign()):cmul(torch.expandAs(part_filter, narrowed_left)):abs():sign():sum(2):select(2,1)
	 accumulated_distance_per_shift_part[i+1] = accumulated_distance_per_shift_part[i+1] + diff_tensor_part:sum()
	 local diff_tensor_cat = torch.add(narrowed_left, -1, narrowed_right):cmul(torch.expandAs(cat_filter, narrowed_left)):pow(2):sum(2):select(2,1):sqrt()
	 --local diff_tensor_cat = torch.add(narrowed_left:clone():sign(), -1, narrowed_right:clone():sign()):cmul(torch.expandAs(cat_filter, narrowed_left)):abs():sign():sum(2):select(2,1)
	 accumulated_distance_per_shift_cat[i+1] = accumulated_distance_per_shift_cat[i+1] + diff_tensor_cat:sum()
      end
      -- accumulate the euclidean norm of each code (full, part-only, and cat-only), used to normalize the distances when plotting; clones protect the module output from the in-place pow
      num_samples_accumulated_magnitude = num_samples_accumulated_magnitude + selected_shrink_output:size(1)
      accumulated_magnitude = accumulated_magnitude + selected_shrink_output:clone():pow(2):sum(2):select(2,1):sqrt():sum()
      accumulated_magnitude_part = accumulated_magnitude_part + selected_shrink_output:clone():cmul(torch.expandAs(part_filter, selected_shrink_output)):pow(2):sum(2):select(2,1):sqrt():sum()
      accumulated_magnitude_cat = accumulated_magnitude_cat + selected_shrink_output:clone():cmul(torch.expandAs(cat_filter, selected_shrink_output)):pow(2):sum(2):select(2,1):sqrt():sum()
   end

   -- Write the invariance plots into opt.log_directory, and save montages of the filters sorted by shift-1 invariance.
   --   opt             : table; opt.log_directory is the output directory
   --   encoding_filter : rows indexed by hidden unit
   --   decoding_filter : columns indexed by hidden unit
   -- Files written: scat_invariance.png, scat_invariance_2.png, scat_invariance_3.png, scat_invariance_progressive.png,
   -- scat_invariance_rep_diff_avg_over_diff.png, scat_invariance_rep_diff_absolute_avg.png, plus the output of plot_sorted_filters.
   function invariance_builder:plot_invariance_scatterplot(opt, encoding_filter, decoding_filter)
      -- recomputed from the arguments; this local shadows the variable of the same name set by set_encoder_and_decoder
      local angle_between_encoder_and_decoder = torch.cdiv(torch.cmul(encoding_filter:t(), decoding_filter):sum(1):select(1,1),
							   torch.cmul(torch.pow(encoding_filter, 2):sum(2):select(2,1):sqrt(),
								      torch.pow(decoding_filter, 2):sum(1):select(1,1):sqrt())):acos()

      gnuplot.pngfigure(opt.log_directory .. '/scat_invariance.png')
      -- y axis is [p(h(x+1) > 0 | h(x) > 0)] / p(h(x) > 0)
      local invariance = torch.cdiv(torch.cdiv(num_iters_conditionally_activated, num_iters_condition_satisfied),
				    torch.div(num_iters_activated, num_iters_processed))
      gnuplot.plot(angle_between_encoder_and_decoder, invariance)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('invariance')
      gnuplot.plotflush()

      -- sort(true) requests descending order, so the most invariant units come first
      local invariance_sorted, sorted_indices_invariance = invariance:sort(true)
      plot_sorted_filters(encoding_filter, decoding_filter, sorted_indices_invariance, opt)

      gnuplot.pngfigure(opt.log_directory .. '/scat_invariance_2.png')
      -- y axis is [p(h(x+2) > 0 | h(x) > 0)] / p(h(x) > 0)
      gnuplot.plot(angle_between_encoder_and_decoder, torch.cdiv(torch.cdiv(num_iters_conditionally_activated_2, num_iters_condition_satisfied_2),
								 torch.div(num_iters_activated, num_iters_processed)))
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('invariance span 2')
      gnuplot.plotflush()


      gnuplot.pngfigure(opt.log_directory .. '/scat_invariance_3.png')
      -- y axis is [p(h(x+3) > 0 | h(x) > 0)] / p(h(x) > 0)
      gnuplot.plot(angle_between_encoder_and_decoder, torch.cdiv(torch.cdiv(num_iters_conditionally_activated_3, num_iters_condition_satisfied_3),
								 torch.div(num_iters_activated, num_iters_processed)))
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('invariance span 3')
      gnuplot.plotflush()


      -- invariance as a function of the shift interval, averaged over all units, over the part units, and over the cat units
      local progressive_linspace = torch.linspace(1,num_conditional_activation_invariance_shifts,num_conditional_activation_invariance_shifts)
      local fraction_iters_activated = torch.Tensor(1,hidden_layer_size):copy(torch.div(num_iters_activated, num_iters_processed))
      local complete_invariance = torch.cdiv(torch.cdiv(num_iters_conditionally_activated_all, num_iters_condition_satisfied_all),
					 fraction_iters_activated:expandAs(num_iters_conditionally_activated_all))
      local progressive_invariance_part = torch.cmul(complete_invariance, torch.expandAs(part_filter, complete_invariance)):sum(2):select(2,1):div(part_filter:sum())
      local progressive_invariance_cat = torch.cmul(complete_invariance, torch.expandAs(cat_filter, complete_invariance)):sum(2):select(2,1):div(cat_filter:sum())
      local progressive_invariance_full = complete_invariance:mean(2):select(2,1)
      gnuplot.pngfigure(opt.log_directory .. '/scat_invariance_progressive.png')
      -- y axis is [p(h(x+i) > 0 | h(x) > 0)] / p(h(x) > 0), for shift interval i on the x axis
      --print('sizes are', progressive_invariance_full:size(), progressive_invariance_part:size(), progressive_invariance_cat:size(), progressive_linspace:size())
      gnuplot.plot({'full', progressive_linspace, progressive_invariance_full},
		   {'part', progressive_linspace, progressive_invariance_part},
		   {'cat', progressive_linspace, progressive_invariance_cat})
      gnuplot.xlabel('shift interval')
      gnuplot.ylabel('average invariance')
      gnuplot.plotflush()


      -- if we wanted to be careful, the baseline should really be calculated based upon the difference between the codes of patches drawn from *different* images, rather than shifts along the same image; there are long-range correlations in the statistics within individual images.  However, this does not change the shape of the invariance curve; it just shifts it vertically.  Any similarity that is preserved over a shift of considerably more than the window is probably not very relevant.
      local linspace = torch.linspace(0,max_num_shifts-1,max_num_shifts)
      local avg_distance_per_shift = torch.cdiv(accumulated_distance_per_shift, num_samples_per_shift)
      local avg_distance_per_shift_part = torch.cdiv(accumulated_distance_per_shift_part, num_samples_per_shift)
      local avg_distance_per_shift_cat = torch.cdiv(accumulated_distance_per_shift_cat, num_samples_per_shift)
      -- the baseline is the mean over elements baseline_offset .. baseline_offset + baseline_interval - 1 of the per-shift averages
      -- NOTE(review): this narrow presumably requires max_num_shifts >= baseline_offset + baseline_interval - 1 (= 49) - confirm
      local baseline_interval = 30
      local baseline_offset = 20
      gnuplot.pngfigure(opt.log_directory .. '/scat_invariance_rep_diff_avg_over_diff.png')
      gnuplot.plot({'full', linspace, torch.div(avg_distance_per_shift, avg_distance_per_shift:narrow(1,baseline_offset,baseline_interval):mean())},
		   {'part', linspace, torch.div(avg_distance_per_shift_part, avg_distance_per_shift_part:narrow(1,baseline_offset,baseline_interval):mean())},
		   {'cat', linspace, torch.div(avg_distance_per_shift_cat, avg_distance_per_shift_cat:narrow(1,baseline_offset,baseline_interval):mean())})
      gnuplot.xlabel('shift magnitude')
      gnuplot.ylabel('average z difference')
      gnuplot.plotflush()

      -- the same distance curves, normalized instead by the average magnitude of the corresponding codes
      local avg_accumulated_magnitude = accumulated_magnitude / num_samples_accumulated_magnitude
      local avg_accumulated_magnitude_part = accumulated_magnitude_part / num_samples_accumulated_magnitude
      local avg_accumulated_magnitude_cat = accumulated_magnitude_cat / num_samples_accumulated_magnitude
      gnuplot.pngfigure(opt.log_directory .. '/scat_invariance_rep_diff_absolute_avg.png')
      gnuplot.plot({'full', linspace, torch.div(avg_distance_per_shift, avg_accumulated_magnitude)},
		   {'part', linspace, torch.div(avg_distance_per_shift_part, avg_accumulated_magnitude_part)},
		   {'cat', linspace, torch.div(avg_distance_per_shift_cat, avg_accumulated_magnitude_cat)})
      gnuplot.xlabel('shift magnitude')
      gnuplot.ylabel('average z difference')
      gnuplot.plotflush()
   end

   return invariance_builder
end

function receptive_field_builder_factory(nExamples, input_size, hidden_layer_size, total_num_shrink_copies, model)
   local accumulated_inputs = {} -- array holding the (unscaled) receptive fields; initialized by the first call to accumulate_weighted_inputs
   local receptive_field_builder = {}
   local shrink_val_tensor = torch.Tensor(total_num_shrink_copies, nExamples, hidden_layer_size) -- output of the shrink nonlinearities for each element of the dataset
   local data_set_tensor = torch.Tensor(nExamples, input_size) -- accumulate the entire dataset used in the diagnostic run; this way, the analysis is correct even if we only present part of the dataset to the model
   local class_tensor = torch.Tensor(nExamples) -- the class should always be a positive integer
   local first_activation, num_activations = torch.Tensor(hidden_layer_size), torch.Tensor(hidden_layer_size)
   local data_set_index = 1 -- present position in the dataset

   -- helper function to build receptive fields
   -- Adds the product of inputs and weights to the running sum stored in
   -- accumulated_inputs[accumulated_inputs_index]; the first call for a given
   -- index creates that running sum.
   --   input_tensor: 1-d (single example) or 2-d (minibatch; transposed before the product)
   --   weight_tensor: the matching weights for the example or minibatch
   --   accumulated_inputs_index: slot of accumulated_inputs to update
   function receptive_field_builder:accumulate_weighted_inputs(input_tensor, weight_tensor, accumulated_inputs_index)
      local running_sum = accumulated_inputs[accumulated_inputs_index]
      local is_single_example = (input_tensor:nDimension() == 1)

      if not running_sum then
         -- first contribution for this slot: the product itself becomes the accumulator
         if is_single_example then
            accumulated_inputs[accumulated_inputs_index] = torch.ger(input_tensor, weight_tensor)
         else
            accumulated_inputs[accumulated_inputs_index] = torch.mm(input_tensor:t(), weight_tensor)
         end
         return
      end

      -- later contributions are added in place
      if is_single_example then
         running_sum:addr(input_tensor, weight_tensor)
      else
         running_sum:addmm(input_tensor:t(), weight_tensor)
      end
   end

   -- this is the interface to the outside world
   -- Records one batch of the diagnostic run: stores the inputs and targets, accumulates
   -- the input/activation products for every shrink stage, and stores the activations.
   --   new_input: batch of inputs; its first dimension is taken as the batch size
   --   base_shrink: module whose .output holds the activations of the first shrink stage
   --   shrink_copies: array of modules whose .output holds the activations of the later stages
   --   new_target: targets for the batch, copied into class_tensor
   -- Raises an error once all nExamples rows have been filled.
   function receptive_field_builder:accumulate_shrink_weighted_inputs(new_input, base_shrink, shrink_copies, new_target)
      local batch_size = new_input:size(1)
      -- data_set_index is the next row to be written (it starts at 1), so the buffers are
      -- only full once it has moved past nExamples; row nExamples itself is still free
      if data_set_index > nExamples then
         error('accumulated ' .. data_set_index .. ' elements in the receptive field builder, but only expected ' .. nExamples)
      end

      --print(data_set_tensor:size(), data_set_index, batch_size)
      if data_set_index + batch_size - 1 > data_set_tensor:size(1) then -- if the number of elements is not a multiple of the batch size, ensure that the last batch is truncated.  THIS IS PROBABLY INCORRECT FOR RECONSTRUCTION RECEPTIVE FIELDS, BUT IS NECESSARY FOR RECONSTRUCTION CONNECTIONS
         print('WARNING: truncating batch!  THIS IS PROBABLY INCORRECT IF WE ARE ACTUALLY RECONSTRUCTING RECEPTIVE FIELDS!!!')
         batch_size = data_set_tensor:size(1) - data_set_index + 1
         new_input = new_input:narrow(1,1,batch_size)
         new_target = new_target:narrow(1,1,batch_size)
      end
      data_set_tensor:narrow(1,data_set_index,batch_size):copy(new_input) -- copy the input values from the dataset
      class_tensor:narrow(1,data_set_index,batch_size):copy(new_target)

      -- the module outputs may hold a full batch, so restrict them to the rows actually kept
      local base_output = base_shrink.output:narrow(1,1,batch_size)
      self:accumulate_weighted_inputs(new_input, base_output, 1) -- accumulate the linear receptive fields
      shrink_val_tensor:select(1,1):narrow(1,data_set_index,batch_size):copy(base_output) -- copy the hidden unit values
      for i = 1,#shrink_copies do
         local copy_output = shrink_copies[i].output:narrow(1,1,batch_size)
         self:accumulate_weighted_inputs(new_input, copy_output, i+1)
         shrink_val_tensor:select(1,i+1):narrow(1,data_set_index,batch_size):copy(copy_output)
      end

      data_set_index = data_set_index + batch_size
   end

   -- Returns a copy of accumulated_inputs[index] in which every column has been scaled to
   -- unit L2 norm.  The accumulator itself is not modified.
   --   index: slot of accumulated_inputs to extract
   function receptive_field_builder:extract_receptive_fields(index)
      local receptive_field_output = accumulated_inputs[index]:clone()
      for i = 1,receptive_field_output:size(2) do
         local selected_col = receptive_field_output:select(2,i)
         local col_norm = selected_col:norm()
         -- an all-zero column has norm 0; dividing by it would fill the column with NaNs,
         -- so such columns are left as zeros
         if col_norm > 0 then
            selected_col:div(col_norm)
         end
      end
      return receptive_field_output
   end

   -- Saves the receptive field of every shrink stage and writes up to three composite
   -- PNG figures into opt.log_directory: 'shrink_dictionary_part.png',
   -- 'shrink_dictionary_categorical.png' and 'shrink_dictionary.png'.
   -- Each figure row is one hidden unit: encoder filter, one tile per shrink stage taken
   -- from the optimal-dictionary matrix, then decoder filter.
   --   opt: options table; only opt.log_directory is read here
   --   encoding_filter: encoder weights, one unit per row (selected along dimension 1); may be nil
   --   decoding_filter: decoder weights, one unit per column (selected along dimension 2); may be nil
   -- When a filter is nil its column in the figures is left blank, and when either is
   -- nil no part/categorical units are selected.
   -- Reads the globals cat_thresh and part_thresh (defined outside this function).
   function receptive_field_builder:plot_receptive_fields(opt, encoding_filter, decoding_filter)
      --shrink_val_tensor:select(2,nExamples+1):zero()
      --data_set_tensor:select(1,nExamples+1):fill(1)

      -- show evolution of optimal dictionaries in a single figure -- hidden_layer_size, total_num_shrink_copies, input_size
      local optimal_dictionary_matrix = torch.Tensor(hidden_layer_size, total_num_shrink_copies, input_size):zero()
      for i = 1,#accumulated_inputs do -- iterate over shrink copies/hidden layers
         local receptive_field_output = self:extract_receptive_fields(i)
         save_filter(receptive_field_output, 'shrink receptive field ' .. i, opt.log_directory)
         -- presumably fills the slice passed as the last argument in place -- confirm against construct_optimal_dictionary
         construct_optimal_dictionary(data_set_tensor, shrink_val_tensor:select(1,i), optimal_dictionary_matrix:select(2,i))
      end

      -- flip the sign and map the symmetric range [-max_val, max_val] onto [0, 1]
      local max_val = math.max(math.abs(optimal_dictionary_matrix:min()), math.abs(optimal_dictionary_matrix:max()))
      optimal_dictionary_matrix:mul(-1)
      optimal_dictionary_matrix:add(max_val):div(2*max_val)

      print('total min and max are ' .. optimal_dictionary_matrix:min() .. ', ' .. optimal_dictionary_matrix:max())

      -- classify unit i by the angle between its encoder row and decoder column
      local function categoricalness_enc_dec_alignment(i)
         local enc = encoding_filter:select(1,i)
         local dec = decoding_filter:select(2,i)
         local angle = math.acos(torch.dot(enc, dec)/(enc:norm() * dec:norm()))
         if angle > cat_thresh then return 'categorical'
         elseif angle < part_thresh then return 'part'
         else return 'intermediate' end
      end

      local max_units_per_figure = 3
      local part_indices, categorical_indices = {}, {}
      local first_part = 6
      local first_categorical = 16

      -- Use indices 6,7,8, restricted to part units, for the part-unit-only figure
      -- Use indices 16,17,18, restricted to categorical units, for the categorical-unit-only figure
      if encoding_filter and decoding_filter then
         for current_index = 1,encoding_filter:size(1) do
            local unit_kind = categoricalness_enc_dec_alignment(current_index) -- classify once per unit
            if unit_kind == 'part' then
               if first_part > 1 then first_part = first_part - 1 -- still skipping the leading part units
               elseif #part_indices < max_units_per_figure then part_indices[#part_indices + 1] = current_index end
            elseif unit_kind == 'categorical' then
               if first_categorical > 1 then first_categorical = first_categorical - 1
               elseif #categorical_indices < max_units_per_figure then categorical_indices[#categorical_indices + 1] = current_index end
            end
            -- stop scanning as soon as both lists are full
            if (#part_indices >= max_units_per_figure) and (#categorical_indices >= max_units_per_figure) then
               break
            end
         end
      end

      -- symmetric normalization bounds; nil when the corresponding filter was not supplied
      local max_encoder = encoding_filter and math.max(math.abs(encoding_filter:min()), math.abs(encoding_filter:max()))
      local max_decoder = decoding_filter and math.max(math.abs(decoding_filter:min()), math.abs(decoding_filter:max()))

      -- construct the full image from the composite pieces
      --   num_rows: number of units to draw, one per row
      --   row_mapper: maps a row number to the index of the hidden unit drawn in that row
      --   file_name: name of the PNG written into opt.log_directory
      local function make_figure(num_rows, row_mapper, file_name)
         local filter_side_length = math.sqrt(input_size)
         local padding = 1
         local extra_padding = 8
         local total_extra_padding = 2*extra_padding
         local xmaps = total_num_shrink_copies + 2
         local ymaps = num_rows
         local height = filter_side_length + padding
         local width = filter_side_length + padding
         local white_value = 1 --(args.symmetric and math.max(math.abs(args.input:min()),math.abs(args.input:max()))) or args.input:max()
         local image_out = torch.Tensor(height*ymaps, width*xmaps + total_extra_padding):fill(white_value)

         for y = 1,ymaps do
            local unit_index = row_mapper(y)
            for x = 1,xmaps do
               -- the encoder column and the decoder column are each set apart by extra_padding
               local current_extra_padding = (math.min(x - 1, 1) + math.max(x - (total_num_shrink_copies + 1), 0)) * extra_padding
               -- NOTE(review): padding/2 is fractional (0.5); the offsets rely on how narrow() converts it to an integer -- confirm
               local selected_image_region = image_out:narrow(1,(y-1)*height+1+padding/2,filter_side_length):narrow(2,(x-1)*width+1+padding/2 + current_extra_padding,filter_side_length)
               local selected_transfer_image
               if x == 1 then -- flip the color maps and normalize the encoder and decoder filters; we can multiply by -1 before normalizing because the normalization is symmetric around 0
                  if encoding_filter then
                     selected_transfer_image = encoding_filter:select(1,unit_index):clone():mul(-1):add(max_encoder):div(2*max_encoder)
                  end
               elseif x == total_num_shrink_copies + 2 then
                  if decoding_filter then
                     selected_transfer_image = decoding_filter:select(2,unit_index):clone():mul(-1):add(max_decoder):div(2*max_decoder)
                  end
               else
                  selected_transfer_image = optimal_dictionary_matrix[{unit_index, x-1, {}}]
               end

               -- a missing filter leaves its tile at white_value
               if selected_transfer_image then
                  selected_image_region:copy(selected_transfer_image:unfold(1, filter_side_length, filter_side_length))
               end
            end
         end
         image.savePNG(paths.concat(opt.log_directory, file_name), image_out)
      end

      if #part_indices > 0 then
         make_figure(#part_indices, function(x) return part_indices[x] end, 'shrink_dictionary_part.png')
      else print('WARNING: NO PART UNITS DETECTED') end
      if #categorical_indices > 0 then
         make_figure(#categorical_indices, function(x) return categorical_indices[x] end, 'shrink_dictionary_categorical.png')
      else print('WARNING: NO CATEGORICAL UNITS DETECTED') end
      make_figure(hidden_layer_size, function(x) return x end, 'shrink_dictionary.png')
   end

   -- Dispatches to one of the global reconstruction plotters, chosen by the first
   -- dimension of the layer-1 decoder weight and by opt.plot_temporal_reconstructions.
   --   opt: options table; opt.plot_temporal_reconstructions is read here and opt is
   --        passed on to the plotter
   function receptive_field_builder:plot_reconstruction_connections(opt)
      local decoder_weight = model.layers[1].module_list.decoding_feature_extraction_dictionary.weight
      local input_dim = decoder_weight:size(1)
      local plot_temporal = opt.plot_temporal_reconstructions

      if (input_dim == 2) or (input_dim == 3) then -- plot reconstructions only as 2d points
         -- either every shrink stage, or only the last one
         local activations
         if plot_temporal then
            activations = shrink_val_tensor
         else
            activations = shrink_val_tensor:select(1,shrink_val_tensor:size(1))
         end
         plot_reconstruction_connections_2d(decoder_weight, activations, data_set_tensor, class_tensor, opt, 20)
      elseif plot_temporal then -- plot filters, as well as reconstructions, as square bitmaps
         plot_reconstruction_evolution(decoder_weight, shrink_val_tensor, data_set_tensor, opt, 20)
      else
         -- the bare name refers to the global plotter, not to this method
         plot_reconstruction_connections(decoder_weight, shrink_val_tensor:select(1,shrink_val_tensor:size(1)), data_set_tensor, opt, 20)
      end
   end

   -- Passes the layer-1 encoder and decoder weights, the activations of the last shrink
   -- stage and the recorded classes to plot_part_sharing_histogram.
   --   opt: options table, forwarded unchanged
   function receptive_field_builder:plot_part_unit_sharing(opt)
      local layer_modules = model.layers[1].module_list
      local encoder_weight = layer_modules.encoding_feature_extraction_dictionary.weight
      local decoder_weight = layer_modules.decoding_feature_extraction_dictionary.weight
      local last_stage = shrink_val_tensor:size(1)
      local final_activations = shrink_val_tensor:select(1,last_stage)

      plot_part_sharing_histogram(encoder_weight, decoder_weight, final_activations, class_tensor, opt)
   end

   function receptive_field_builder:plot_other_figures(opt)
      --[[
      plot_part_sharing_histogram(model.layers[1].module_list.encoding_feature_extraction_dictionary.weight,
				  model.layers[1].module_list.decoding_feature_extraction_dictionary.weight,
				  shrink_val_tensor:select(1,shrink_val_tensor:size(1)), class_tensor, opt)
      --]]
      local activated_at_zero = torch.gt(shrink_val_tensor:select(1,1), 0):double():sum(1):select(1,1)
      local activated_at_one = torch.add(torch.gt(shrink_val_tensor:select(1,2), 0):double(), -1, torch.gt(shrink_val_tensor:select(1,1), 0):double()):maxZero():sum(1):select(1,1)
      local activated_at_end = torch.gt(shrink_val_tensor:select(1,shrink_val_tensor:size(1)), 0):double():sum(1):select(1,1)
      --local activated_after_zero = torch.gt(shrink_val_tensor:narrow(1,2,total_num_shrink_copies-1):sum(1):select(1,1), 0):double():sum(1):select(1,1) -- works since activities are non-negative
      local activated_ever = torch.gt(shrink_val_tensor:sum(1):select(1,1), 0):double():sum(1):select(1,1) -- works since activities are non-negative
      -- activated after zero but not at zero = activated_ever - activated_at_zero
      activated_ever[torch.le(activated_ever, 1)] = 1
      local safe_activated_at_end = activated_at_end:clone()
      safe_activated_at_end[torch.le(activated_at_end, 1)] = 1
      local average_value_when_activated = torch.sum(shrink_val_tensor:select(1,shrink_val_tensor:size(1)), 1):select(1,1):cdiv(safe_activated_at_end)

      local percentage_late_activation = torch.cdiv(torch.add(activated_ever, -1, activated_at_zero), activated_ever)
      local percentage_first_iter_activation = torch.cdiv(activated_at_zero, activated_ever)
      local percentage_second_iter_activation = torch.cdiv(activated_at_one, activated_ever)
      local percentage_activated_at_end = torch.div(activated_at_end, shrink_val_tensor:size(2))
      --print('percentage late activation', percentage_late_activation:unfold(1,10,10))

      local norm_vec = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local enc_norm_vec = torch.Tensor(model.layers[1].module_list.encoding_feature_extraction_dictionary.weight:size(1))
      local dec_norm_vec = torch.Tensor(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:size(2))
      local classification_norm_vec = torch.Tensor(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:size(2))
      local prod_norm_vec = torch.Tensor(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:size(2))
      local ista_ideal_prod = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local ista_ideal_norm_vec = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))

      local average_recurrent_pos_connection_angle = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_neg_connection_angle = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_pos_connection_categoricalness = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_neg_connection_categoricalness = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_total_connection_categoricalness = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_part_connection_angle = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_categorical_connection_angle = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))
      local average_recurrent_categorical_connection_angle_mod = torch.Tensor(model.layers[1].module_list.explaining_away.weight:size(1))

      local deviation_of_recurrent_weight_from_ISTA = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local deviation_of_recurrent_weight_from_ISTA_just_parts_inputs = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local categoricalness_of_recurrent_weight_recipient = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)

      local dot_product_between_decoders_per_connection_from_part_to_part = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local dot_product_between_decoders_per_connection_from_categorical_to_part = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local dot_product_between_decoders_per_connection_from_part_to_categorical = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local dot_product_between_decoders_per_connection_from_categorical_to_categorical = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local angle_between_classifiers_per_connection_from_categorical_to_categorical = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local weight_of_connections_from_part_to_part = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local weight_of_connections_from_categorical_to_part = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local weight_of_connections_from_part_to_categorical = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)
      local weight_of_connections_from_categorical_to_categorical = torch.Tensor(model.layers[1].module_list.explaining_away.weight:nElement()):fill(-100)

      local cwm_pc_num_bins = 100
      local connection_weight_means_part_to_categorical = torch.Tensor(cwm_pc_num_bins):zero()
      local connection_weight_counts_part_to_categorical = torch.Tensor(cwm_pc_num_bins):zero()
      local connection_weight_dot_products_part_to_categorical = torch.linspace(-1,1,cwm_pc_num_bins)

      local ista_ideal_matrix = torch.mm(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:t(), model.layers[1].module_list.decoding_feature_extraction_dictionary.weight):mul(-1) --:add(-1, torch.diag(torch.ones(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:size(2)))) -- NOT NECESSARY since the identity matrix is already added in explicitly

      --torch.diag(torch.mm(model.layers[1].module_list.encoding_feature_extraction_dictionary.weight, model.layers[1].module_list.decoding_feature_extraction_dictionary.weight)),

      for i = 1,model.layers[1].module_list.explaining_away.weight:size(1) do
	 norm_vec[i] = model.layers[1].module_list.explaining_away.weight:select(1,i):norm()
	 enc_norm_vec[i] = model.layers[1].module_list.encoding_feature_extraction_dictionary.weight:select(1,i):norm()
	 dec_norm_vec[i] = model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:select(2,i):norm()
	 classification_norm_vec[i] = model.module_list.classification_dictionary.weight:select(2,i):norm()
	 prod_norm_vec[i] = torch.dot(model.layers[1].module_list.encoding_feature_extraction_dictionary.weight:select(1,i),
				      model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:select(2,i))

	 ista_ideal_prod[i] = torch.dot(model.layers[1].module_list.explaining_away.weight:select(1,i),
				  ista_ideal_matrix:select(1,i))
	 ista_ideal_norm_vec[i] = ista_ideal_matrix:select(1,i):norm()
      end
      --print(norm_vec:unfold(1,10,10))
      local angle_between_encoder_and_decoder = torch.cdiv(prod_norm_vec, torch.cmul(enc_norm_vec, dec_norm_vec)):acos()
      local angle_between_recurrent_input_and_ISTA_ideal = torch.cdiv(ista_ideal_prod, torch.cmul(norm_vec, ista_ideal_norm_vec)):acos()

      for i = 1,model.layers[1].module_list.explaining_away.weight:size(1) do
	 local pos_norm, neg_norm, pos_weighted_sum_angle, neg_weighted_sum_angle, pos_weighted_sum_categoricalness, neg_weighted_sum_categoricalness = 0, 0, 0, 0, 0, 0
	 local part_norm, categorical_norm, part_weighted_sum_angle, categorical_weighted_sum_angle, categorical_weighted_sum_angle_mod = 0, 0, 0, 0, 0
	 local sorted_recurrent_weights = torch.abs(model.layers[1].module_list.explaining_away.weight:select(1,i)):sort()
	 local median_abs_weight = sorted_recurrent_weights[math.ceil(sorted_recurrent_weights:size(1) * (97.5/100))]
	 --print(median_abs_weight)
      	 for j = 1,model.layers[1].module_list.explaining_away.weight:size(2) do
	    local dot_product_between_decoders = torch.dot(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:select(2,i),
							   model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:select(2,j))
	    local angle_between_classifiers = torch.dot(model.module_list.classification_dictionary.weight:select(2,i),
							model.module_list.classification_dictionary.weight:select(2,j)) / (classification_norm_vec[i] * classification_norm_vec[j])

	    local exp_away_linearized_index = j + (i-1)*model.layers[1].module_list.explaining_away.weight:size(2)
	    --deviation_of_recurrent_weight_from_ISTA[exp_away_linearized_index] = math.max(-3, math.min(3, -1 * model.layers[1].module_list.explaining_away.weight[{i,j}] + (1.25/11)*dot_product_between_decoders)) -- - (((i == j) and 1) or 0)))
	    -- plot the ratio between the actual weight and the ISTA-ideal weight, but only for the weights whose magnitude exceeds median_abs_weight (despite its name, the 97.5th percentile of the absolute weights in this row), since the ratio is unstable for small weights.  Bound the ratio between -0.5 and 2, so outliers don't disrupt the scale of the plot.  Multiply by -1 since the ISTA ideal is -1 * dot_product_between_decoders
	    deviation_of_recurrent_weight_from_ISTA[exp_away_linearized_index] = math.max(-0.5, math.min(2, -1 * (((math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}]) > median_abs_weight) and 1) or 0) * model.layers[1].module_list.explaining_away.weight[{i,j}] / dot_product_between_decoders))
	    deviation_of_recurrent_weight_from_ISTA_just_parts_inputs[exp_away_linearized_index] = math.max(-0.5, math.min(2, -1 * (((math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}]) > median_abs_weight) and 1) or 0) * (((angle_between_encoder_and_decoder[j] < 0.55) and 1) or 0) * model.layers[1].module_list.explaining_away.weight[{i,j}] / dot_product_between_decoders))
	    categoricalness_of_recurrent_weight_recipient[exp_away_linearized_index] = angle_between_encoder_and_decoder[i]

	    local cwm_bin = math.max(1, math.floor(cwm_pc_num_bins * (dot_product_between_decoders + 1) / 2))
	    if (angle_between_encoder_and_decoder[i] > cat_thresh) and (angle_between_encoder_and_decoder[j] < part_thresh) then
	       connection_weight_means_part_to_categorical[cwm_bin] = connection_weight_means_part_to_categorical[cwm_bin] + model.layers[1].module_list.explaining_away.weight[{i,j}]
	       connection_weight_counts_part_to_categorical[cwm_bin] = connection_weight_counts_part_to_categorical[cwm_bin] + 1
	    end

	    dot_product_between_decoders_per_connection_from_part_to_part[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] < part_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] < part_thresh) and 1) or 0) * dot_product_between_decoders
	    weight_of_connections_from_part_to_part[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] < part_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] < part_thresh) and 1) or 0) * model.layers[1].module_list.explaining_away.weight[{i,j}]

	    dot_product_between_decoders_per_connection_from_categorical_to_part[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] < part_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] > cat_thresh) and 1) or 0) * dot_product_between_decoders
	    weight_of_connections_from_categorical_to_part[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] < part_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] > cat_thresh) and 1) or 0) * model.layers[1].module_list.explaining_away.weight[{i,j}]

	    dot_product_between_decoders_per_connection_from_part_to_categorical[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] > cat_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] < part_thresh) and 1) or 0) * dot_product_between_decoders
	    weight_of_connections_from_part_to_categorical[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] > cat_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] < part_thresh) and 1) or 0) * model.layers[1].module_list.explaining_away.weight[{i,j}]

	    dot_product_between_decoders_per_connection_from_categorical_to_categorical[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] > cat_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] > cat_thresh) and 1) or 0) * dot_product_between_decoders
	    weight_of_connections_from_categorical_to_categorical[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] > cat_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] > cat_thresh) and 1) or 0) * model.layers[1].module_list.explaining_away.weight[{i,j}]
	    angle_between_classifiers_per_connection_from_categorical_to_categorical[exp_away_linearized_index] =
	       (((angle_between_encoder_and_decoder[i] > cat_thresh) and 1) or 0) * (((angle_between_encoder_and_decoder[j] > cat_thresh) and 1) or 0) * angle_between_classifiers

	    if i ~= j then -- ignore the diagonal
	       local val_angle = math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}]) *
		  math.acos(dot_product_between_decoders / (dec_norm_vec[i] * dec_norm_vec[j]))
	       local val_categoricalness = math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}]) * angle_between_encoder_and_decoder[j]

	       if model.layers[1].module_list.explaining_away.weight[{i,j}] >= 0 then
		  pos_weighted_sum_angle = pos_weighted_sum_angle + val_angle
		  pos_weighted_sum_categoricalness = pos_weighted_sum_categoricalness + val_categoricalness
		  pos_norm = pos_norm + math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}])
	       else
		  neg_weighted_sum_angle = neg_weighted_sum_angle + val_angle
		  neg_weighted_sum_categoricalness = neg_weighted_sum_categoricalness + val_categoricalness
		  neg_norm = neg_norm + math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}])
	       end

	       if angle_between_encoder_and_decoder[j] < part_thresh then
		  part_weighted_sum_angle = part_weighted_sum_angle +
		     model.layers[1].module_list.explaining_away.weight[{i,j}] * (math.pi/2 - math.acos(dot_product_between_decoders / (dec_norm_vec[i] * dec_norm_vec[j])))
		  part_norm = part_norm + math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}])
	       elseif angle_between_encoder_and_decoder[j] > cat_thresh then
		  categorical_weighted_sum_angle = categorical_weighted_sum_angle +
		     model.layers[1].module_list.explaining_away.weight[{i,j}] * (math.pi/4 - math.acos(dot_product_between_decoders / (dec_norm_vec[i] * dec_norm_vec[j])))
		  categorical_weighted_sum_angle_mod = categorical_weighted_sum_angle_mod +
		     model.layers[1].module_list.explaining_away.weight[{i,j}] * (math.pi/2 - math.acos(dot_product_between_decoders / (dec_norm_vec[i] * dec_norm_vec[j])))
		  categorical_norm = categorical_norm + math.abs(model.layers[1].module_list.explaining_away.weight[{i,j}])
	       end
	    end
	 end
	 pos_norm = (((pos_norm == 0) and 1) or pos_norm)
	 neg_norm = (((neg_norm == 0) and 1) or neg_norm)
	 average_recurrent_pos_connection_angle[i] = pos_weighted_sum_angle / pos_norm
	 average_recurrent_neg_connection_angle[i] = neg_weighted_sum_angle / neg_norm
	 average_recurrent_pos_connection_categoricalness[i] = pos_weighted_sum_categoricalness / pos_norm
	 average_recurrent_neg_connection_categoricalness[i] = neg_weighted_sum_categoricalness / neg_norm
	 average_recurrent_total_connection_categoricalness[i] = (pos_weighted_sum_categoricalness + neg_weighted_sum_categoricalness) / (pos_norm + neg_norm)
	 part_norm = (((part_norm == 0) and 1) or part_norm)
	 categorical_norm = (((categorical_norm == 0) and 1) or categorical_norm)
	 average_recurrent_part_connection_angle[i] = part_weighted_sum_angle / part_norm
	 average_recurrent_categorical_connection_angle[i] = categorical_weighted_sum_angle / categorical_norm
	 average_recurrent_categorical_connection_angle_mod[i] = categorical_weighted_sum_angle_mod / categorical_norm
      end

      connection_weight_counts_part_to_categorical[torch.lt(connection_weight_counts_part_to_categorical, 1)] = 1

      local norm_classification_connection = torch.Tensor(model.module_list.classification_dictionary.weight:size(2))
      for i = 1,model.module_list.classification_dictionary.weight:size(2) do
	 norm_classification_connection[i] = model.module_list.classification_dictionary.weight:select(2,i):norm()
      end


      gnuplot.pngfigure(opt.log_directory .. '/scat_recurrent_weight_match_to_ista_ideal.png')
      gnuplot.plot(angle_between_encoder_and_decoder, angle_between_recurrent_input_and_ISTA_ideal)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('angle between recurrent input and ista ideal')
      gnuplot.plotflush()

      gnuplot.pngfigure(opt.log_directory .. '/scat_prob_of_late_activation.png') -- percentage of inputs for which the unit is activated at some point, but the first activation occurs after the first iteration; versus the magnitude of the recurrent connections; categorical units turn on later, since they have poorly structured encoder inputs but strong connections to part-units.
      gnuplot.plot(angle_between_encoder_and_decoder, percentage_late_activation)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('prob of late activation')
      gnuplot.plotflush()

      gnuplot.pngfigure(opt.log_directory .. '/scat_prob_of_second_iter_activation.png') -- percentage of inputs for which the unit is activated at some point, but the first activation occurs at the second iteration; versus the magnitude of the recurrent connections; categorical units turn on later, since they have poorly structured encoder inputs but strong connections to part-units.
      gnuplot.plot(angle_between_encoder_and_decoder, percentage_second_iter_activation)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('prob of second iter activation')
      gnuplot.plotflush()

      gnuplot.pngfigure(opt.log_directory .. '/scat_prob_active_at_end.png') -- percentage of inputs for which the unit is activated at the end
      gnuplot.plot(angle_between_encoder_and_decoder, percentage_activated_at_end)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('prob activated at end')
      gnuplot.plotflush()


      gnuplot.figure() -- histogram of recurrent connections; categorical units have larger recurrent connections
      gnuplot.hist(norm_vec, 50)

      gnuplot.figure() -- mean recurrent connections versus magnitude of recurrent connections; categorical units have more negative and larger recurrent connections (this is actually a little counterintuitive, since categorical units derive most of their excitation from recurrent connections; presumably, they perform an and-not computation, and there are many units that can veto the activity of a given categorical unit; the nature of this computation will be explicated by plotting the dictionaries of the largest recurrent connections to each unit
      gnuplot.plot(angle_between_encoder_and_decoder,
		   torch.add(model.layers[1].module_list.explaining_away.weight, torch.diag(torch.ones(model.layers[1].module_list.explaining_away.weight:size(1)))):mean(1):select(1,1))
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('recurrent connection mean')

      gnuplot.figure() -- recurrent connection diagonal versus categoricalness
      gnuplot.plot(angle_between_encoder_and_decoder, torch.diag(model.layers[1].module_list.explaining_away.weight))
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('explaining away matrix diagonal')


      --[[
      gnuplot.figure() -- mean recurrent connections excluding diagonal versus magnitude of recurrent connections; categorical units have more negative and larger recurrent connections (this is actually a little counterintuitive, since categorical units derive most of their excitation from recurrent connections; presumably, they perform an and-not computation, and there are many units that can veto the activity of a given categorical unit; the nature of this computation will be explicated by plotting the dictionaries of the largest recurrent connections to each unit
      gnuplot.plot(angle_between_encoder_and_decoder,
		   torch.add(model.layers[1].module_list.explaining_away.weight, -1, torch.diag(torch.diag(model.layers[1].module_list.explaining_away.weight))):mean(1):select(1,1))
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('recurrent connection mean without diagonal')
      --]]

      gnuplot.pngfigure(opt.log_directory .. '/scat_decoder_mean.png') -- mean decoder column versus categoricalness
      gnuplot.plot(angle_between_encoder_and_decoder,
		   model.layers[1].module_list.decoding_feature_extraction_dictionary.weight:mean(1):select(1,1)) -- argument to mean is the dimension collapsed
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('decoder mean')
      gnuplot.plotflush()

      gnuplot.figure() -- mean decoder column versus categoricalness
      gnuplot.plot(angle_between_encoder_and_decoder,
		   model.layers[1].module_list.encoding_feature_extraction_dictionary.weight:mean(2):select(2,1)) -- argument to mean is the dimension collapsed
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('encoder mean')


      gnuplot.pngfigure(opt.log_directory .. '/scat_recurrent_connection_magnitude.png') -- cos(angle) between encoder and decoder versus magnitude of recurrent input; categorical units have unaligned encoder/decoder pairs and larger recurrent connections
      gnuplot.plot(angle_between_encoder_and_decoder, norm_vec)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('recurrent connection magnitude')
      gnuplot.plotflush()

      gnuplot.figure() -- cos(angle) between encoder and decoder versus magnitude of recurrent input; categorical units have unaligned encoder/decoder pairs and larger recurrent connections
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_pos_connection_angle)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average angle between decoder and positively recurrently connected decoders')

      gnuplot.figure() -- cos(angle) between encoder and decoder versus magnitude of recurrent input; categorical units have unaligned encoder/decoder pairs and larger recurrent connections
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_neg_connection_angle)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average angle between decoder and negatively recurrently connected decoders')

      gnuplot.figure()
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_part_connection_angle)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average angle between decoder and part-restricted decoders')

      gnuplot.figure()
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_categorical_connection_angle)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average angle between decoder and categorical-restricted decoders')

      --[[ this doesn't work as well as the pi/4 version above
      gnuplot.figure()
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_categorical_connection_angle_mod)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average angle between decoder and categorical-restricted decoders - pi/2')
      --]]

      gnuplot.pngfigure(opt.log_directory .. '/scat_classification_dictionary_connection_magnitude.png')
      gnuplot.plot(angle_between_encoder_and_decoder, norm_classification_connection)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('classification dictionary connection magnitude')
      gnuplot.plotflush()


      --[[
      gnuplot.figure() -- cos(angle) between encoder and decoder versus magnitude of recurrent input; categorical units have unaligned encoder/decoder pairs and larger recurrent connections
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_pos_connection_categoricalness)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average categoricalness between decoder and positively recurrently connected decoders')

      gnuplot.figure() -- cos(angle) between encoder and decoder versus magnitude of recurrent input; categorical units have unaligned encoder/decoder pairs and larger recurrent connections
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_neg_connection_categoricalness)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average categoricalness between decoder and negatively recurrently connected decoders')
      --]]

      gnuplot.pngfigure(opt.log_directory .. '/scat_weighted_average_categoricalness.png') -- cos(angle) between encoder and decoder versus magnitude of recurrent input; categorical units have unaligned encoder/decoder pairs and larger recurrent connections
      gnuplot.plot(angle_between_encoder_and_decoder, average_recurrent_total_connection_categoricalness)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('weighted average afferent enc-dec angle')
      gnuplot.plotflush()

      print(angle_between_encoder_and_decoder:unfold(1,10,10))
      print(average_value_when_activated:unfold(1,10,10))
      print(average_value_when_activated:size())

      gnuplot.pngfigure(opt.log_directory .. '/scat_average_final_value_when_activation.png')
      gnuplot.plot(angle_between_encoder_and_decoder, average_value_when_activated)
      gnuplot.xlabel('angle between encoder and decoder')
      gnuplot.ylabel('average final value of unit when activated')
      gnuplot.plotflush()

      gnuplot.pngfigure(opt.log_directory .. '/scat_average_final_value_when_activation_recurrent_connection.png')
      gnuplot.plot(angle_between_recurrent_input_and_ISTA_ideal, average_value_when_activated)
      gnuplot.xlabel('angle between recurrent and ISTA ideal')
      gnuplot.ylabel('average final value of unit when activated')
      gnuplot.plotflush()

      gnuplot.pngfigure(opt.log_directory .. '/scat_class_dict_mag_versus_final_activation.png')
      gnuplot.plot(norm_classification_connection, average_value_when_activated)
      gnuplot.xlabel('classification dictionary column magnitude')
      gnuplot.ylabel('average final value of unit when activated')
      gnuplot.plotflush()

      gnuplot.figure()
      gnuplot.plot(categoricalness_of_recurrent_weight_recipient, deviation_of_recurrent_weight_from_ISTA, '.')
      gnuplot.xlabel('categoricalness of recurrent weight recipient')
      gnuplot.ylabel('ratio between recurrent weight and ISTA ideal')

      --[[
      gnuplot.figure()
      gnuplot.plot(categoricalness_of_recurrent_weight_recipient, deviation_of_recurrent_weight_from_ISTA_just_parts_inputs)
      gnuplot.xlabel('categoricalness of recurrent weight recipient')
      gnuplot.ylabel('ratio between recurrent weight and ISTA ideal restricted to parts inputs')
      --]]

      gnuplot.pngfigure(opt.log_directory .. '/scat_ista_weights_part_to_part.png')
      --gnuplot.figure()
      gnuplot.plot(dot_product_between_decoders_per_connection_from_part_to_part, weight_of_connections_from_part_to_part, '.')
      gnuplot.xlabel('dot product between decoders from part to part')
      gnuplot.ylabel('connection weight')
      gnuplot.plotflush()

      gnuplot.pngfigure(opt.log_directory .. '/scat_ista_weights_categorical_to_part.png')
      --gnuplot.figure()
      gnuplot.plot(dot_product_between_decoders_per_connection_from_categorical_to_part, weight_of_connections_from_categorical_to_part, '.')
      gnuplot.xlabel('dot product between decoders from categorical to part')
      gnuplot.ylabel('connection weight')
      gnuplot.plotflush()

      gnuplot.figure()
      gnuplot.plot(dot_product_between_decoders_per_connection_from_part_to_categorical, weight_of_connections_from_part_to_categorical, '.')
      gnuplot.xlabel('dot product between decoders from part to categorical')
      gnuplot.ylabel('connection weight')

      gnuplot.figure()
      gnuplot.plot(dot_product_between_decoders_per_connection_from_categorical_to_categorical, weight_of_connections_from_categorical_to_categorical, '.')
      gnuplot.xlabel('dot product between decoders from categorical to categorical')
      gnuplot.ylabel('connection weight')

      gnuplot.pngfigure(opt.log_directory .. '/scat_v_diagram.png')
      gnuplot.plot(connection_weight_dot_products_part_to_categorical, connection_weight_means_part_to_categorical:cdiv(connection_weight_counts_part_to_categorical))
      gnuplot.xlabel('dot product between decoders from part to categorical')
      gnuplot.ylabel('average connection weight')
      gnuplot.plotflush()

      gnuplot.figure()
      gnuplot.plot(angle_between_classifiers_per_connection_from_categorical_to_categorical, weight_of_connections_from_categorical_to_categorical, '.')
      gnuplot.xlabel('cos(angle) between classifiers from categorical to categorical')
      gnuplot.ylabel('connection weight')


      --plot_reconstruction_connections(model.layers[1].module_list.decoding_feature_extraction_dictionary.weight, shrink_val_tensor:select(1,shrink_val_tensor:size(1)), data_set_tensor, opt, 20)
      plot_hidden_unit_trajectories(shrink_val_tensor:select(2,1), opt, 400)
      plot_hidden_unit_trajectories(shrink_val_tensor:select(2,1), opt, 400, 1, model.layers[1].module_list.encoding_feature_extraction_dictionary.weight,
				    model.layers[1].module_list.decoding_feature_extraction_dictionary.weight) -- shrink_val_tensor = torch.Tensor(total_num_shrink_copies, nExamples, hidden_layer_size)
      plot_hidden_unit_trajectories(shrink_val_tensor:select(2,1), opt, 400, -1, model.layers[1].module_list.encoding_feature_extraction_dictionary.weight,
				    model.layers[1].module_list.decoding_feature_extraction_dictionary.weight) -- shrink_val_tensor = torch.Tensor(total_num_shrink_copies, nExamples, hidden_layer_size)


      --[[
      first_activation:zero()
      num_activations:zero()
      --total_num_shrink_copies, nExamples, hidden_layer_size
      for i = 1,hidden_layer_size do
	 for j = 1,nExamples do
	    for k = 1,total_num_shrink_copies do
	       if shrink_val_tensor[{k,j,i}] > 0 then
		  --first_activation[i] = first_activation[i] + (((k == 1) and 0) or 1) --k-1
		  if k > 1 then
		     first_activation[i] = first_activation[i] + 1
		  end
		  num_activations[i] = num_activations[i] + 1
		  break
	       end
	    end
	 end
      end
      num_activations[torch.le(num_activations, 1)] = 1
      --first_activation:cdiv(num_activations)
      print(torch.cdiv(first_activation, num_activations):unfold(1,10,10))

      print(torch.add(activated_ever, -1, activated_at_zero):unfold(1,10,10))
      print(first_activation:unfold(1,10,10))

      print(activated_ever:unfold(1,10,10))
      print(num_activations:unfold(1,10,10))
      --]]
   end

   -- Interactive walk through the recorded hidden-unit trajectories.
   -- For every index along dimension 2 of shrink_val_tensor, three calls to plot_hidden_unit_trajectories are made on that slice:
   --   * one with no filter arguments (nil, nil, nil),
   --   * one with 1 plus the layer-1 encoding and decoding dictionary weights,
   --   * one with -1 plus the same weights,
   -- after which the loop blocks on io.read() until a line is entered on stdin.
   -- opt is forwarded untouched to plot_hidden_unit_trajectories.
   -- NOTE(review): the trailing 1/2/3 argument presumably selects the figure each plot is drawn into, and the 1/-1 argument presumably selects a restriction on which units are shown -- confirm against plot_hidden_unit_trajectories.
   -- NOTE(review): a comment elsewhere in this file describes shrink_val_tensor as torch.Tensor(total_num_shrink_copies, nExamples, hidden_layer_size), in which case dimension 2 indexes examples -- confirm.
   function receptive_field_builder:quick_diagnostic_plots(opt)
      local num_slices = shrink_val_tensor:size(2)
      for slice_index = 1, num_slices do
         -- unrestricted view of this slice
         plot_hidden_unit_trajectories(shrink_val_tensor:select(2, slice_index), opt, 400, nil, nil, nil, 1)

         -- the two restricted views share the same encoder/decoder dictionaries
         local module_list = model.layers[1].module_list
         local encoder_weight = module_list.encoding_feature_extraction_dictionary.weight
         local decoder_weight = module_list.decoding_feature_extraction_dictionary.weight
         plot_hidden_unit_trajectories(shrink_val_tensor:select(2, slice_index), opt, 400, 1, encoder_weight, decoder_weight, 2)
         plot_hidden_unit_trajectories(shrink_val_tensor:select(2, slice_index), opt, 400, -1, encoder_weight, decoder_weight, 3)

         -- pause until the user presses enter before moving on to the next slice
         io.read()
      end
   end


   -- Return the builder to its initial accumulation state: data_set_index is set back to 0 and every entry of accumulated_inputs is zeroed in place via its zero() method.
   function receptive_field_builder:reset()
      data_set_index = 0
      local num_accumulators = #accumulated_inputs
      for accumulator_index = 1, num_accumulators do
         local accumulator = accumulated_inputs[accumulator_index]
         accumulator:zero()
      end
   end

   return receptive_field_builder
end


-- Write a signed bar into args.current_column, beginning at index image_edge_length + 2.
-- args is a table with the fields:
--   bar_length         signed value to display; its sign selects the sign of the written entries
--   max_bar_length     magnitude corresponding to a full-length bar of (image_edge_length - 2) entries
--   image_edge_length  used both for the offset at which the bar starts and for its maximum extent
--   max_decoding       magnitude written into each entry of the bar
--   current_column     container written through [] (presumably a tensor slice holding one display column -- confirm against callers)
-- The number of entries written is ceil((image_edge_length - 2) * |bar_length| / max_bar_length).
-- Side effects: besides writing into current_column, an over-long bar prints a warning and args.bar_length is overwritten with the clamped (sign-preserving) value.
local function plot_bar(args) -- {bar_length, max_bar_length, image_edge_length, max_decoding, current_column}
   -- take the sign by comparison rather than x/|x|, which evaluates to NaN for a zero-length bar
   local bar_sign = (args.bar_length < 0) and -1 or 1
   local bar_magnitude = math.abs(args.bar_length)
   -- clamp on the magnitude: the loop bound below uses |bar_length|, so an over-long negative bar must be truncated just like a positive one or it overruns the (image_edge_length - 2) entries reserved for the bar
   if bar_magnitude > args.max_bar_length then
      print('bar length > max bar length')
      bar_magnitude = args.max_bar_length
      args.bar_length = bar_sign * bar_magnitude
   end
   for i=1,math.ceil((args.image_edge_length - 2) * bar_magnitude/args.max_bar_length) do
      args.current_column[args.image_edge_length + 1 + i] = args.max_decoding * bar_sign
   end
end


-- plot the decoding dictionaries of the top n largest-magnitude connections to each unit, scaled by the connection weight.  This gives a sense of how each unit's activation is computed from the other units.  If restrictions is a table, it is organized like {(rows of fig contain connections from common: source, destination), (restrict source to: any, part, categorical), (restrict dest to: any, part, categorical), (separate by class)}
function plot_explaining_away_connections(encoding_filter, decoding_filter, explaining_away_filter_orig, opt, restrictions, classification_filter, start_display_row, num_display_rows)
   local num_sorted_connections = 20 -- number of connections to show for each unit
   local explaining_away_mag_filter = explaining_away_filter_orig:clone() -- this is used to select which connections to display, and is altered below depending upon the type of connections desired
   local explaining_away_filter = explaining_away_filter_orig:clone() -- make a copy so as to avoid corrupting the original filter
   local file_name, col_type, row_type
   local separate_by_class = false -- reorder each row of the display so that connections of a given class are grouped together.  This makes it apparent if they tend to have the same sign
   local restrict_source_and_dest = false -- don't plot all units and connections; rather based upon the value of restrictions, plot only sources and destinations of particular types
   local dont_restrict_max_on_col = false -- when computing the maximal connection value for scaling the bars on top of the decoders, should the col-restriction be enforced?
   local dont_restrict_max_on_row = false

   if restrictions == 'restrict to positive' then
      explaining_away_mag_filter:maxZero()
      file_name = 'positive sorted recurrent connections'
   elseif type(restrictions) == 'table' then
      local connection_direction_name
      if false and (restrictions[3] == 'categorical') then -- the projections to the categorical units are not ISTA-like, and so the diagonal doesn't have any special meaning
	 print('adding in diagonal')
	 explaining_away_mag_filter:add(torch.diag(torch.ones(explaining_away_filter:size(2))))
	 explaining_away_filter:add(torch.diag(torch.ones(explaining_away_filter:size(2))))
      end