信息提取工作流指令文件用于驱动 DataScraper 的工作流引擎。该文件记录了完成某个主题的 Web 信息提取任务所需的全部工作流处理器及其执行顺序。该文件是一个 XML 文件,文件名后缀为 .profile.xml。下面是一个例子:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Session profile for the DataScraper workflow engine: it names the theme to
  extract and lists the workflow processors used for that theme.

  NOTE(review): the processors are presumably executed in document order, so
  keep the sequence below unchanged unless the engine documentation says
  otherwise.
  NOTE(review): the encoding is declared explicitly because the theme name may
  contain non-ASCII text; this assumes the file is saved as UTF-8 (confirm).
  In a real .profile.xml file the XML declaration must be the very first thing
  in the file.
-->
<geometa-session-profile>
  <!-- The theme name must be correct. -->
  <theme>主题名</theme>

  <!-- In this example every processor id equals the last segment of its <class> value. -->
  <processor id="MigrateWorksBucket">
    <class>com.geometa.spider.processor.MigrateWorksBucket</class>
  </processor>
  <processor id="FetchSpiderClue">
    <class>com.geometa.spider.processor.FetchSpiderClue</class>
  </processor>
  <processor id="LoadHtmlPage">
    <class>com.geometa.spider.processor.LoadHtmlPage</class>
  </processor>
  <processor id="FindDataSchema_Plain">
    <class>com.geometa.spider.processor.FindDataSchema_Plain</class>
  </processor>
  <processor id="ExtractWebNodeData_Simp">
    <class>com.geometa.spider.processor.ExtractWebNodeData_Simp</class>
  </processor>
  <processor id="ValidateExtraction">
    <class>com.geometa.spider.processor.ValidateExtraction</class>
  </processor>
  <processor id="SaveFile_Simp">
    <class>com.geometa.spider.processor.SaveFile_Simp</class>
  </processor>
  <processor id="ExtractSpiderClue_Simp">
    <class>com.geometa.spider.processor.ExtractSpiderClue_Simp</class>
  </processor>
  <processor id="ConfirmSpiderClue_Simp">
    <class>com.geometa.spider.processor.ConfirmSpiderClue_Simp</class>
  </processor>
  <processor id="CleanWorksBucket">
    <class>com.geometa.spider.processor.CleanWorksBucket</class>
  </processor>
</geometa-session-profile>