-
-
Save monperrus/7157717 to your computer and use it in GitHub Desktop.
| import java.io.*; | |
| import java.util.*; | |
| /** The class encapsulates an implementation of the Apriori algorithm | |
| * to compute frequent itemsets. | |
| * | |
| * Datasets contains integers (>=0) separated by spaces, one transaction by line, e.g. | |
| * 1 2 3 | |
| * 0 9 | |
| * 1 9 | |
| * | |
| * Usage with the command line : | |
| * $ java mining.Apriori fileName support | |
| * $ java mining.Apriori /tmp/data.dat 0.8 | |
| * $ java mining.Apriori /tmp/data.dat 0.8 > frequent-itemsets.txt | |
| * | |
| * For a full library, see SPMF https://www.philippe-fournier-viger.com/spmf/ | |
| * | |
| * @author Martin Monperrus, University of Darmstadt, 2010 | |
| * @author Nathan Magnus and Su Yibin, under the supervision of Howard Hamilton, University of Regina, June 2009. | |
| * @copyright GNU General Public License v3 | |
| * No reproduction in whole or part without maintaining this copyright notice | |
| * and imposing this condition on any subsequent users. | |
| */ | |
| public class Apriori extends Observable { | |
| public static void main(String[] args) throws Exception { | |
| Apriori ap = new Apriori(args); | |
| } | |
| /** the list of current itemsets */ | |
| private List<int[]> itemsets ; | |
| /** the name of the transcation file */ | |
| private String transaFile; | |
| /** number of different items in the dataset */ | |
| private int numItems; | |
| /** total number of transactions in transaFile */ | |
| private int numTransactions; | |
| /** minimum support for a frequent itemset in percentage, e.g. 0.8 */ | |
| private double minSup; | |
| /** by default, Apriori is used with the command line interface */ | |
| private boolean usedAsLibrary = false; | |
| /** This is the main interface to use this class as a library */ | |
| public Apriori(String[] args, Observer ob) throws Exception | |
| { | |
| usedAsLibrary = true; | |
| configure(args); | |
| this.addObserver(ob); | |
| go(); | |
| } | |
| /** generates the apriori itemsets from a file | |
| * | |
| * @param args configuration parameters: args[0] is a filename, args[1] the min support (e.g. 0.8 for 80%) | |
| */ | |
| public Apriori(String[] args) throws Exception | |
| { | |
| configure(args); | |
| go(); | |
| } | |
| /** starts the algorithm after configuration */ | |
| private void go() throws Exception { | |
| //start timer | |
| long start = System.currentTimeMillis(); | |
| // first we generate the candidates of size 1 | |
| createItemsetsOfSize1(); | |
| int itemsetNumber=1; //the current itemset being looked at | |
| int nbFrequentSets=0; | |
| while (itemsets.size()>0) | |
| { | |
| calculateFrequentItemsets(); | |
| if(itemsets.size()!=0) | |
| { | |
| nbFrequentSets+=itemsets.size(); | |
| log("Found "+itemsets.size()+" frequent itemsets of size " + itemsetNumber + " (with support "+(minSup*100)+"%)");; | |
| createNewItemsetsFromPreviousOnes(); | |
| } | |
| itemsetNumber++; | |
| } | |
| //display the execution time | |
| long end = System.currentTimeMillis(); | |
| log("Execution time is: "+((double)(end-start)/1000) + " seconds."); | |
| log("Found "+nbFrequentSets+ " frequents sets for support "+(minSup*100)+"% (absolute "+Math.round(numTransactions*minSup)+")"); | |
| log("Done"); | |
| } | |
| /** triggers actions if a frequent item set has been found */ | |
| private void foundFrequentItemSet(int[] itemset, int support) { | |
| if (usedAsLibrary) { | |
| this.setChanged(); | |
| notifyObservers(itemset); | |
| } | |
| else {System.out.println(Arrays.toString(itemset) + " ("+ ((support / (double) numTransactions))+" "+support+")");} | |
| } | |
| /** outputs a message in Sys.err if not used as library */ | |
| private void log(String message) { | |
| if (!usedAsLibrary) { | |
| System.err.println(message); | |
| } | |
| } | |
| /** computes numItems, numTransactions, and sets minSup */ | |
| private void configure(String[] args) throws Exception | |
| { | |
| // setting transafile | |
| if (args.length!=0) transaFile = args[0]; | |
| else transaFile = "chess.dat"; // default | |
| // setting minsupport | |
| if (args.length>=2) minSup=(Double.valueOf(args[1]).doubleValue()); | |
| else minSup = .8;// by default | |
| if (minSup>1 || minSup<0) throw new Exception("minSup: bad value"); | |
| // going thourgh the file to compute numItems and numTransactions | |
| numItems = 0; | |
| numTransactions=0; | |
| BufferedReader data_in = new BufferedReader(new FileReader(transaFile)); | |
| while (data_in.ready()) { | |
| String line=data_in.readLine(); | |
| if (line.matches("\\s*")) continue; // be friendly with empty lines | |
| numTransactions++; | |
| StringTokenizer t = new StringTokenizer(line," "); | |
| while (t.hasMoreTokens()) { | |
| int x = Integer.parseInt(t.nextToken()); | |
| //log(x); | |
| if (x+1>numItems) numItems=x+1; | |
| } | |
| } | |
| outputConfig(); | |
| } | |
| /** outputs the current configuration | |
| */ | |
| private void outputConfig() { | |
| //output config info to the user | |
| log("Input configuration: "+numItems+" items, "+numTransactions+" transactions, "); | |
| log("minsup = "+minSup*100+"%"); | |
| } | |
| /** puts in itemsets all sets of size 1, | |
| * i.e. all possibles items of the datasets | |
| */ | |
| private void createItemsetsOfSize1() { | |
| itemsets = new ArrayList<int[]>(); | |
| for(int i=0; i<numItems; i++) | |
| { | |
| int[] cand = {i}; | |
| itemsets.add(cand); | |
| } | |
| } | |
| /** | |
| * if m is the size of the current itemsets, | |
| * generate all possible itemsets of size n+1 from pairs of current itemsets | |
| * replaces the itemsets of itemsets by the new ones | |
| */ | |
| private void createNewItemsetsFromPreviousOnes() | |
| { | |
| // by construction, all existing itemsets have the same size | |
| int currentSizeOfItemsets = itemsets.get(0).length; | |
| log("Creating itemsets of size "+(currentSizeOfItemsets+1)+" based on "+itemsets.size()+" itemsets of size "+currentSizeOfItemsets); | |
| HashMap<String, int[]> tempCandidates = new HashMap<String, int[]>(); //temporary candidates | |
| // compare each pair of itemsets of size n-1 | |
| for(int i=0; i<itemsets.size(); i++) | |
| { | |
| for(int j=i+1; j<itemsets.size(); j++) | |
| { | |
| int[] X = itemsets.get(i); | |
| int[] Y = itemsets.get(j); | |
| assert (X.length==Y.length); | |
| //make a string of the first n-2 tokens of the strings | |
| int [] newCand = new int[currentSizeOfItemsets+1]; | |
| for(int s=0; s<newCand.length-1; s++) { | |
| newCand[s] = X[s]; | |
| } | |
| int ndifferent = 0; | |
| // then we find the missing value | |
| for(int s1=0; s1<Y.length; s1++) | |
| { | |
| boolean found = false; | |
| // is Y[s1] in X? | |
| for(int s2=0; s2<X.length; s2++) { | |
| if (X[s2]==Y[s1]) { | |
| found = true; | |
| break; | |
| } | |
| } | |
| if (!found){ // Y[s1] is not in X | |
| ndifferent++; | |
| // we put the missing value at the end of newCand | |
| newCand[newCand.length -1] = Y[s1]; | |
| } | |
| } | |
| // we have to find at least 1 different, otherwise it means that we have two times the same set in the existing candidates | |
| assert(ndifferent>0); | |
| if (ndifferent==1) { | |
| // HashMap does not have the correct "equals" for int[] :-( | |
| // I have to create the hash myself using a String :-( | |
| // I use Arrays.toString to reuse equals and hashcode of String | |
| Arrays.sort(newCand); | |
| tempCandidates.put(Arrays.toString(newCand),newCand); | |
| } | |
| } | |
| } | |
| //set the new itemsets | |
| itemsets = new ArrayList<int[]>(tempCandidates.values()); | |
| log("Created "+itemsets.size()+" unique itemsets of size "+(currentSizeOfItemsets+1)); | |
| } | |
| /** put "true" in trans[i] if the integer i is in line */ | |
| private void line2booleanArray(String line, boolean[] trans) { | |
| Arrays.fill(trans, false); | |
| StringTokenizer stFile = new StringTokenizer(line, " "); //read a line from the file to the tokenizer | |
| //put the contents of that line into the transaction array | |
| while (stFile.hasMoreTokens()) | |
| { | |
| int parsedVal = Integer.parseInt(stFile.nextToken()); | |
| trans[parsedVal]=true; //if it is not a 0, assign the value to true | |
| } | |
| } | |
| /** passes through the data to measure the frequency of sets in {@link itemsets}, | |
| * then filters thoses who are under the minimum support (minSup) | |
| */ | |
| private void calculateFrequentItemsets() throws Exception | |
| { | |
| log("Passing through the data to compute the frequency of " + itemsets.size()+ " itemsets of size "+itemsets.get(0).length); | |
| List<int[]> frequentCandidates = new ArrayList<int[]>(); //the frequent candidates for the current itemset | |
| boolean match; //whether the transaction has all the items in an itemset | |
| int count[] = new int[itemsets.size()]; //the number of successful matches, initialized by zeros | |
| // load the transaction file | |
| BufferedReader data_in = new BufferedReader(new InputStreamReader(new FileInputStream(transaFile))); | |
| boolean[] trans = new boolean[numItems]; | |
| // for each transaction | |
| for (int i = 0; i < numTransactions; i++) { | |
| // boolean[] trans = extractEncoding1(data_in.readLine()); | |
| String line = data_in.readLine(); | |
| line2booleanArray(line, trans); | |
| // check each candidate | |
| for (int c = 0; c < itemsets.size(); c++) { | |
| match = true; // reset match to false | |
| // tokenize the candidate so that we know what items need to be | |
| // present for a match | |
| int[] cand = itemsets.get(c); | |
| //int[] cand = candidatesOptimized[c]; | |
| // check each item in the itemset to see if it is present in the | |
| // transaction | |
| for (int xx : cand) { | |
| if (trans[xx] == false) { | |
| match = false; | |
| break; | |
| } | |
| } | |
| if (match) { // if at this point it is a match, increase the count | |
| count[c]++; | |
| //log(Arrays.toString(cand)+" is contained in trans "+i+" ("+line+")"); | |
| } | |
| } | |
| } | |
| data_in.close(); | |
| for (int i = 0; i < itemsets.size(); i++) { | |
| // if the count% is larger than the minSup%, add to the candidate to | |
| // the frequent candidates | |
| if ((count[i] / (double) (numTransactions)) >= minSup) { | |
| foundFrequentItemSet(itemsets.get(i),count[i]); | |
| frequentCandidates.add(itemsets.get(i)); | |
| } | |
| //else log("-- Remove candidate: "+ Arrays.toString(candidates.get(i)) + " is: "+ ((count[i] / (double) numTransactions))); | |
| } | |
| //new candidates are only the frequent candidates | |
| itemsets = frequentCandidates; | |
| } | |
| } |
Usage as library: see {@link ExampleOfClientCodeOfApriori} where is this example?
@monperrus Everyone, be aware with the usage of the code. The code assumes that your transactions DB contains records all from 0 to n. If your records don't start with 0, e..g [209 212 209 212 212 212; 45 63 89; 89 53 63], above code will not work. The author should make appropriate changes in config function.
I forked this code and added association rules to it enjoy ;)
My Forked Apriori.java
@ENGINEER-RC thanks bro
I need the description of the data set "retail.gz " available in the link "http://fimi.ua.ac.be/data/." please help me.
how can i add chess.dat in netbeans?
Thanks so much for sharing! However I found a typo in the code, specifically, line 151
log("minsup = "+minSup+"%");
should be
log("minsup = "+minSup*100+"%");
I found a typo in the code, specifically, line 151
Thanks for typo! Fixed.
For a full library, see SPMF https://www.philippe-fournier-viger.com/spmf/
i want the same code but only for strings not only integers any help?
I forked this code and added association rules to it enjoy ;)
My Forked Apriori.java
can you please share your file again? It seems that it doesn't exist anymore
I forked this code and added association rules to it enjoy ;)
My Forked Apriori.javacan you please share your file again? It seems that it doesn't exist anymore
https://gist.github.com/tomrockdsouza/34bdc63161befad19ce33564a473fc58
Please provide me code for reverse apriori algorithm in R or java
sathyamphil2016@gmail.com this is my mail id