public class GeneratorJob extends NutchTool implements Tool
| Modifier and Type | Class and Description |
|---|---|
static class |
GeneratorJob.SelectorEntry |
static class |
GeneratorJob.SelectorEntryComparator |
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
BATCH_ID |
static java.lang.String |
GENERATE_COUNT |
static java.lang.String |
GENERATE_UPDATE_CRAWLDB |
static java.lang.String |
GENERATOR_COUNT_MODE |
static java.lang.String |
GENERATOR_COUNT_VALUE_DOMAIN |
static java.lang.String |
GENERATOR_COUNT_VALUE_HOST |
static java.lang.String |
GENERATOR_COUNT_VALUE_IP |
static java.lang.String |
GENERATOR_CUR_TIME |
static java.lang.String |
GENERATOR_DELAY |
static java.lang.String |
GENERATOR_FILTER |
static java.lang.String |
GENERATOR_MAX_COUNT |
static java.lang.String |
GENERATOR_MIN_SCORE |
static java.lang.String |
GENERATOR_NORMALISE |
static java.lang.String |
GENERATOR_RANDOM_SEED |
static java.lang.String |
GENERATOR_SITEMAP |
static java.lang.String |
GENERATOR_TOP_N |
protected static org.slf4j.Logger |
LOG |
Fields inherited from class NutchTool: currentJob, currentJobNum, numJobs, results, status
| Constructor and Description |
|---|
GeneratorJob() |
GeneratorJob(Configuration conf) |
| Modifier and Type | Method and Description |
|---|---|
java.lang.String |
generate(long topN,
long curTime,
boolean filter,
boolean norm,
boolean sitemap)
Mark URLs ready for fetching.
|
java.util.Collection<WebPage.Field> |
getFields(Job job) |
static void |
main(java.lang.String[] args) |
static java.lang.String |
randomBatchId()
Generates a random batch id
|
java.util.Map<java.lang.String,java.lang.Object> |
run(java.util.Map<java.lang.String,java.lang.Object> args)
Runs generator
|
int |
run(java.lang.String[] args)
Runs generator from commandline
|
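Methods inherited from class NutchTool: getProgress, getStatus, killJob, stopJob
Methods inherited from class org.apache.hadoop.conf.Configured: getConf, setConf
Methods inherited from class java.lang.Object: clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
Methods inherited from interface org.apache.hadoop.conf.Configurable: getConf, setConf
public static final java.lang.String GENERATE_UPDATE_CRAWLDB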
public static final java.lang.String GENERATOR_MIN_SCORE
public static final java.lang.String GENERATOR_FILTER
public static final java.lang.String GENERATOR_NORMALISE
public static final java.lang.String GENERATOR_SITEMAP
public static final java.lang.String GENERATOR_MAX_COUNT
public static final java.lang.String GENERATOR_COUNT_MODE
public static final java.lang.String GENERATOR_COUNT_VALUE_DOMAIN
public static final java.lang.String GENERATOR_COUNT_VALUE_HOST
public static final java.lang.String GENERATOR_COUNT_VALUE_IP
public static final java.lang.String GENERATOR_TOP_N
public static final java.lang.String GENERATOR_CUR_TIME
public static final java.lang.String GENERATOR_DELAY
public static final java.lang.String GENERATOR_RANDOM_SEED
public static final java.lang.String BATCH_ID
public static final java.lang.String GENERATE_COUNT
protected static final org.slf4j.Logger LOG
public GeneratorJob()
public GeneratorJob(Configuration conf)
public java.util.Collection<WebPage.Field> getFields(Job job)
public static java.lang.String randomBatchId()
public java.util.Map<java.lang.String,java.lang.Object> run(java.util.Map<java.lang.String,java.lang.Object> args)
throws java.lang.Exception
public java.lang.String generate(long topN,
long curTime,
boolean filter,
boolean norm,
boolean sitemap)
throws java.lang.Exception
Parameters:
topN - top threshold for maximum number of URLs permitted in a batch
curTime - the current time in milliseconds
filter - optional filtering of URLs within the generated batch
norm - optional normalization of URLs within the generated batch
sitemap - flag indicating whether a URL is a sitemap and hence processed accordingly
Throws:
java.lang.Exception
public int run(java.lang.String[] args)
throws java.lang.Exception
public static void main(java.lang.String[] args)
throws java.lang.Exception
Throws:
java.lang.Exception
Copyright © 2019 The Apache Software Foundation