Friday 15 August 2014

join - RapidMiner Union over Collection (of Data Tables) -



join - RapidMiner Union over Collection (of Data Tables) -

I am having a hard time finding a solution for doing a union on a collection of data tables in RapidMiner (6). I have a collection (in case you don't know it, imagine an array of objects) that contains a variable number of data tables, each having at least one "id" column with the same ids, plus a column with a different name in each. I want to merge these in a union (aka join), so that I have records of the form [id, column_from_data_table_1, column_from_data_table_2, ...].

For example, the collection can contain any number n of data tables of the form:

Table 1
id  col1
1   0.5
2   0.7

Table 2
id  col2
1   0.1
2   0.0

...

Table n
id  coln
1   0.0
2   0.8

And at the end, the union (join) of the tables in the collection should look like this:

Result
id  col1  col2  ...  coln
1   0.5   0.1   ...  0.0
2   0.7   0.0   ...  0.8

Note that each table has the same number of records and the same ids assigned to them (plus the column names across tables, except id, are unique - in other words, the data couldn't be more ideal for this).

This is quite an advanced process involving Remember, Recall, Branch, Join, Select and Loop Collection. Here's a cut-down example.

<?xml version="1.0" encoding="utf-8" standalone="no"?>
<!--
  RapidMiner process: folds a collection of example sets into one joined example set.
  Outline:
    1. "subprocess" builds three example sets (id + col1 / col2 / col3) and collects them.
    2. "select (2)" takes one element of the collection and "remember" stores it as "1".
    3. "loop collection" visits every element; from the second iteration on it recalls "1",
       joins it with the current element and stores the result under "1" again.
    4. "recall (2)" fetches the object stored as "1" and delivers it as the process result.
-->
<process version="6.1.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.1.000-snapshot" expanded="true"
            name="process">
    <process expanded="true">

      <!-- Step 1: generate the sample data and wrap it in a collection. -->
      <operator activated="true" class="subprocess" compatibility="6.1.000-snapshot" expanded="true"
                height="76" name="subprocess" width="90" x="112" y="30">
        <process expanded="true">

          <!-- Table 1: two generated example sets (id 1 and id 2, attribute col1), appended. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000-snapshot" expanded="true"
                    height="60" name="generate info user specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="id" value="1"/>
              <parameter key="col1" value="48"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000-snapshot" expanded="true"
                    height="60" name="generate info user specification (2)" width="90" x="45" y="120">
            <list key="attribute_values">
              <parameter key="id" value="2"/>
              <parameter key="col1" value="4"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="6.1.000-snapshot" expanded="true"
                    height="94" name="append" width="90" x="179" y="30"/>

          <!-- Table 2: same ids, attribute col2. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000-snapshot" expanded="true"
                    height="60" name="generate info user specification (3)" width="90" x="45" y="210">
            <list key="attribute_values">
              <parameter key="id" value="1"/>
              <parameter key="col2" value="9"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000-snapshot" expanded="true"
                    height="60" name="generate info user specification (4)" width="90" x="45" y="300">
            <list key="attribute_values">
              <parameter key="id" value="2"/>
              <parameter key="col2" value="7"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="6.1.000-snapshot" expanded="true"
                    height="94" name="append (2)" width="90" x="179" y="210"/>

          <!-- Table 3: same ids, attribute col3. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000-snapshot" expanded="true"
                    height="60" name="generate info user specification (5)" width="90" x="45" y="390">
            <list key="attribute_values">
              <parameter key="id" value="1"/>
              <parameter key="col3" value="88"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.1.000-snapshot" expanded="true"
                    height="60" name="generate info user specification (6)" width="90" x="45" y="480">
            <list key="attribute_values">
              <parameter key="id" value="2"/>
              <parameter key="col3" value="78"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="6.1.000-snapshot" expanded="true"
                    height="94" name="append (3)" width="90" x="179" y="390"/>

          <!-- Gathers the three appended tables into one collection. -->
          <operator activated="true" class="collect" compatibility="6.1.000-snapshot" expanded="true"
                    height="112" name="collect" width="90" x="380" y="210"/>

          <!-- Wiring: generator pairs feed their Append, each Append feeds one Collect input. -->
          <connect from_op="generate info user specification" from_port="output" to_op="append" to_port="example set 1"/>
          <connect from_op="generate info user specification (2)" from_port="output" to_op="append" to_port="example set 2"/>
          <connect from_op="append" from_port="merged set" to_op="collect" to_port="input 1"/>
          <connect from_op="generate info user specification (3)" from_port="output" to_op="append (2)" to_port="example set 1"/>
          <connect from_op="generate info user specification (4)" from_port="output" to_op="append (2)" to_port="example set 2"/>
          <connect from_op="append (2)" from_port="merged set" to_op="collect" to_port="input 2"/>
          <connect from_op="generate info user specification (5)" from_port="output" to_op="append (3)" to_port="example set 1"/>
          <connect from_op="generate info user specification (6)" from_port="output" to_op="append (3)" to_port="example set 2"/>
          <connect from_op="append (3)" from_port="merged set" to_op="collect" to_port="input 3"/>
          <connect from_op="collect" from_port="collection" to_port="out 1"/>
          <portspacing port="source_in 1" spacing="0"/>
          <portspacing port="sink_out 1" spacing="0"/>
          <portspacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>

      <!-- Step 2: duplicate the collection; one copy seeds the store, the other is looped over. -->
      <operator activated="true" class="multiply" compatibility="6.1.000-snapshot" expanded="true"
                height="94" name="multiply (2)" width="90" x="246" y="30"/>
      <!-- No index parameter is set here, so the operator's default selection applies. -->
      <operator activated="true" class="select" compatibility="6.1.000-snapshot" expanded="true"
                height="60" name="select (2)" width="90" x="447" y="30"/>
      <operator activated="true" class="remember" compatibility="6.1.000-snapshot" expanded="true"
                height="60" name="remember" width="90" x="581" y="30">
        <parameter key="name" value="1"/>
      </operator>

      <!-- Step 3: iterate over the collection; the iteration macro drives the Branch condition. -->
      <operator activated="true" class="loop_collection" compatibility="6.1.000-snapshot" expanded="true"
                height="76" name="loop collection" width="90" x="447" y="165">
        <parameter key="set_iteration_macro" value="true"/>
        <process expanded="true">
          <operator activated="true" class="branch" compatibility="6.1.000-snapshot" expanded="true"
                    height="76" name="branch" width="90" x="112" y="120">
            <parameter key="condition_type" value="expression"/>
            <parameter key="condition_value" value="%{iteration}==1"/>

            <!-- First subprocess: passes the incoming element straight through, no join. -->
            <process expanded="true">
              <connect from_port="condition" to_port="input 1"/>
              <portspacing port="source_condition" spacing="0"/>
              <portspacing port="source_input 1" spacing="0"/>
              <portspacing port="sink_input 1" spacing="0"/>
              <portspacing port="sink_input 2" spacing="0"/>
            </process>

            <!-- Second subprocess: recall "1", join it with the current element, store as "1" again. -->
            <process expanded="true">
              <operator activated="true" class="recall" compatibility="6.1.000-snapshot" expanded="true"
                        height="60" name="recall" width="90" x="112" y="75">
                <parameter key="name" value="1"/>
              </operator>
              <!-- key_attributes is empty; presumably the join then matches on the id-role
                   attribute (confirm against the Join operator's defaults). -->
              <operator activated="true" class="join" compatibility="6.1.000-snapshot" expanded="true"
                        height="76" name="join" width="90" x="246" y="30">
                <list key="key_attributes"/>
              </operator>
              <operator activated="true" class="remember" compatibility="6.1.000-snapshot" expanded="true"
                        height="60" name="remember (2)" width="90" x="380" y="30">
                <parameter key="name" value="1"/>
              </operator>
              <connect from_port="condition" to_op="join" to_port="left"/>
              <connect from_op="recall" from_port="result" to_op="join" to_port="right"/>
              <connect from_op="join" from_port="join" to_op="remember (2)" to_port="store"/>
              <connect from_op="remember (2)" from_port="stored" to_port="input 1"/>
              <portspacing port="source_condition" spacing="0"/>
              <portspacing port="source_input 1" spacing="0"/>
              <portspacing port="sink_input 1" spacing="0"/>
              <portspacing port="sink_input 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="single" to_op="branch" to_port="condition"/>
          <connect from_op="branch" from_port="input 1" to_port="output 1"/>
          <portspacing port="source_single" spacing="0"/>
          <portspacing port="sink_output 1" spacing="0"/>
          <portspacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>

      <!-- Step 4: the loop's own output is not connected; the result is read back from the store. -->
      <operator activated="true" class="recall" compatibility="6.1.000-snapshot" expanded="true"
                height="60" name="recall (2)" width="90" x="581" y="165">
        <parameter key="name" value="1"/>
      </operator>

      <connect from_op="subprocess" from_port="out 1" to_op="multiply (2)" to_port="input"/>
      <connect from_op="multiply (2)" from_port="output 1" to_op="select (2)" to_port="collection"/>
      <connect from_op="multiply (2)" from_port="output 2" to_op="loop collection" to_port="collection"/>
      <connect from_op="select (2)" from_port="selected" to_op="remember" to_port="store"/>
      <connect from_op="recall (2)" from_port="result" to_port="result 1"/>
      <portspacing port="source_input 1" spacing="0"/>
      <portspacing port="sink_result 1" spacing="0"/>
      <portspacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Hope it helps.

join collections union rapidminer

No comments:

Post a Comment