/*************************************************************************\
* Copyright (c) 2010 The University of Chicago, as Operator of Argonne
* National Laboratory.
* Copyright (c) 2010 The Regents of the University of California, as
* Operator of Los Alamos National Laboratory.
* This file is distributed subject to a Software License Agreement found
* in the file LICENSE that is included with this distribution.
\*************************************************************************/

/* program: sddsduplicate
 * purpose: duplicate rows in a file according to a weight column,
 *          or by a fixed factor given on the command line
 *
 * Michael Borland, 2010
 *
 * Revision history:
 $Log: sddsduplicate.c,v $
 Revision 1.2  2010/09/10 01:45:09  borland
 Added -seed and -probabilistic options. If number of rows after duplication
 is not an integer and -probabilistic is given, the non-integer part is taken
 as a probability.

 Revision 1.1  2010/09/08 19:40:41  borland
 First version.

 *
 */
#include "mdb.h"
#include "scan.h"
#include "SDDS.h"

/* Option codes. Each SET_* value equals the index of the matching keyword
 * in option[] below, so the two lists must be kept in the same order.
 * N_OPTIONS is the number of entries in that table.
 */
#define SET_WEIGHT 0
#define SET_PIPE 1
#define SET_MAXFACTOR 2
#define SET_MINFACTOR 3
#define SET_FACTOR 4
#define SET_VERBOSITY 5
#define SET_SEED 6
#define SET_PROBABILISTIC 7
#define N_OPTIONS 8

/* Command-line option keywords, listed in SET_* order.
 * NOTE(review): presumably these are matched against the scanned arguments
 * in main; the argument-parsing loop is not visible in this copy of the
 * file, so confirm against the upstream source.
 */
static char *option[N_OPTIONS] = {
  "weight",
  "pipe",
  "maxfactor",
  "minfactor",
  "factor",
  "verbosity",
  "seed",
  "probabilistic",
};

/* Usage text; main passes it to bomb() when fewer than three arguments
 * are supplied.
 * NOTE(review): the angle-bracket placeholders (file names, option values)
 * look like they were lost from this copy of the text, leaving items such
 * as the empty bracket pairs and option names followed by a bare equals
 * sign; the line layout of the literal is also collapsed. Restore from the
 * upstream source rather than editing by hand.
 */
static char *USAGE="sddsduplicate [] [] [-pipe=[input][,output]]\n\ {-weight= {-minFactor= | -maxFactor=} | -factor=}\n\ [-probabilistic] [-seed=] [-verbosity[=]]\n\ This program duplicates rows in the input file and creates a new file.\n\ The number of duplicates is determined either by a weight column or by a fixed value.\n\ -weight Name of a column to use for weighting the number of duplicates.\n\ -minFactor Minimum number of rows to emit. Results in scaling of weights.\n\ -maxFactor Maximum number of rows to emit. 
Results in scaling of weights.\n\ In some cases, input rows will not be reflected in the output file\n\ because the weight is less than 1.\n\ -factor Number of duplicates to create. Incompatible with -weight.\n\ -probabilistic\n\ If given, then fractional duplication counts are taken as probabilities.\n\ -seed Set seed for random number generator. By default, seeded by the \n\ system clock.\n\ -verbosity Set verbosity level.\n\n\ Program by Michael Borland. (This is version 1, September 2010.)\n"; int main(int argc, char **argv) { SDDS_DATASET SDDS_input, SDDS_output; char *inputfile, *outputfile; long i, i_arg, verbosity; SCANNED_ARG *s_arg; unsigned long pipeFlags; char *weightColumnName; double *weightData, minWeight, maxWeight; double *dupValue; long maxFactor, minFactor, dupRows; long j, inputRows, storedRows, randomNumberSeed; short probabilistic = 0; SDDS_RegisterProgramName(argv[0]); argc = scanargs(&s_arg, argc, argv); if (argc<3) bomb(NULL, USAGE); weightColumnName = NULL; maxFactor = minFactor = dupRows = 0; inputfile = NULL; outputfile = NULL; pipeFlags = 0; verbosity = 0; argc = scanargs(&s_arg, argc, argv); if (argc<3) bomb(NULL, USAGE); for (i_arg=1; i_arg0) { inputRows = SDDS_RowCount(&SDDS_input); if (inputRows) { dupValue = tmalloc(sizeof(*dupValue)*inputRows); if (weightColumnName) { if (!(weightData = SDDS_GetColumnInDoubles(&SDDS_input, weightColumnName))) SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors|SDDS_EXIT_PrintErrors); if (minFactor) { find_min_max(&minWeight, &maxWeight, weightData, inputRows); if (minWeight<=0) SDDS_Bomb("Minimum weight value is nonpositive. 
Can't use -minFactor."); for (i=0; irandom_1(0)) dupValue[i] += 1; } } else for (i=0; idupValue[i]) minDup = dupValue[i]; } fprintf(stderr, "%ld output rows, mininum and maximum duplication factor: %ld, %ld\n", storedRows, minDup, maxDup); } if (!SDDS_StartPage(&SDDS_output, storedRows) || !SDDS_CopyParameters(&SDDS_output, &SDDS_input) || !SDDS_CopyArrays(&SDDS_output, &SDDS_input)) SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors|SDDS_EXIT_PrintErrors); storedRows = 0; for (i=0; i